[MaskRCNN] update docs; add SyncBN option

0b2f3c11 · Yuxin Wu · a3581e79 · 0b2f3c11 · 0b2f3c11 · 0b2f3c11
Commit 0b2f3c11 authored Jun 29, 2018 by Yuxin Wu
6 changed files
--- a/examples/FasterRCNN/NOTES.md
+++ b/examples/FasterRCNN/NOTES.md
@@ -35,20 +35,21 @@ Model:

 3. We only support single image per GPU.

-4. Because of (3), BatchNorm statistics are not supposed to be updated during fine-tuning.
+4. Because of (3), BatchNorm statistics are supposed to be frozen during fine-tuning.
   This specific kind of BatchNorm will need [my kernel](https://github.com/tensorflow/tensorflow/pull/12580)
-   which is included since TF 1.4. If using an earlier version of TF, it will be either slow or wrong.
+   which is included since TF 1.4.
+   
+5. An alternative to freezing BatchNorm is to sync BatchNorm statistics across
+   GPUs (the `BACKBONE.NORM=SyncBN` option). This would require [my bugfix](https://github.com/tensorflow/tensorflow/pull/20360)
+   which will probably be in TF 1.10. You can manually apply the patch to use it.
+   For now the total batch size is at most 8, so this option does not improve the model by much.

 Speed:

 1. Training starts out very slow due to convolution warmup, and only reaches its maximum speed after about 10k steps.
-	 Then the training speed will slowly decrease due to more accurate proposals.
+   Then the training speed will slowly decrease due to more accurate proposals.

-2. Inference is not quite fast, because either you disable convolution autotune and end up with
-	 a slow convolution algorithm, or you spend more time on autotune.
-	 This is a general problem of TensorFlow when running against variable-sized input.
-
-3. This implementation is about 14% slower than detectron,
+2. This implementation is about 14% slower than detectron,
   probably due to the lack of specialized ops (e.g. AffineChannel, ROIAlign) in TensorFlow.
   It's certainly faster than other TF implementations.


--- a/examples/FasterRCNN/README.md
+++ b/examples/FasterRCNN/README.md
@@ -11,7 +11,7 @@ This example provides a minimal (only 1.6k lines) and faithful implementation of
 + Pre-trained [ImageNet ResNet model](http://models.tensorpack.com/ResNet/) from tensorpack model zoo.
 + COCO data. It needs to have the following directory structure:
 ```
-DIR/
+COCO/DIR/
  annotations/
    instances_train2014.json
    instances_val2014.json
@@ -27,27 +27,31 @@ DIR/


 ## Usage
-Change config in `config.py`:
-1. Change `BASEDIR` to `/path/to/DIR` as described above.
-2. Change `MODE_MASK`/`MODE_FPN`, or other options you like. Recommended configurations are listed in the table below.
-
-Train:
+To train:
 ```
-./train.py --load /path/to/ImageNet-ResNet50.npz
+./train.py --config \
+    MODE_MASK=True MODE_FPN=True \
+    DATA.BASEDIR=/path/to/COCO/DIR \
+    BACKBONE.WEIGHTS=/path/to/ImageNet-ResNet50.npz
 ```
+Options can be changed either on the command line or in the `config.py` file.
+Recommended configurations are listed in the table below.
+
 The code is only valid for training with 1, 2, 4 or 8 GPUs.
 Training with fewer than 8 GPUs may result in performance different from the table below.

-Predict on an image (and show output in a window):
+To predict on an image (and show output in a window):
 ```
-./train.py --predict input.jpg --load /path/to/model
+./train.py --predict input.jpg --load /path/to/model --config SAME-AS-TRAINING
 ```

 Evaluate the performance of a model on COCO, and save results to json.
 (Trained COCO models can be downloaded from the [model zoo](http://models.tensorpack.com/FasterRCNN)):
 ```
-./train.py --evaluate output.json --load /path/to/COCO-ResNet50-MaskRCNN.npz
+./train.py --evaluate output.json --load /path/to/COCO-ResNet50-MaskRCNN.npz \
+    --config MODE_MASK=True DATA.BASEDIR=/path/to/COCO/DIR
 ```
+Evaluation or prediction will need the same config used during training.

 ## Results


--- a/examples/FasterRCNN/basemodel.py
+++ b/examples/FasterRCNN/basemodel.py
@@ -36,6 +36,15 @@ def resnet_argscope():
        yield


+@contextmanager
+def maybe_syncbn_scope():  # context manager wrapped around layer construction; only has an effect for the SyncBN setting
+    if cfg.BACKBONE.NORM == 'SyncBN':
+        with argscope(BatchNorm, training=None, sync_statistics='nccl'):  # presumably applies these BatchNorm arguments to layers built while the caller's `with` body runs -- confirm against argscope docs
+            yield
+    else:
+        yield  # any other BACKBONE.NORM value: yield without entering an argscope
+
+
 def image_preprocess(image, bgr=True):
    with tf.name_scope('image_preprocess'):
        if image.dtype.base_dtype != tf.float32:
@@ -107,15 +116,16 @@ def resnet_c4_backbone(image, num_blocks, freeze_c2=True):
        # TODO replace var by const to enable optimization
        if freeze_c2:
            c2 = tf.stop_gradient(c2)
-        c3 = resnet_group('group1', c2, resnet_bottleneck, 128, num_blocks[1], 2)
-        c4 = resnet_group('group2', c3, resnet_bottleneck, 256, num_blocks[2], 2)
+        with maybe_syncbn_scope():
+            c3 = resnet_group('group1', c2, resnet_bottleneck, 128, num_blocks[1], 2)
+            c4 = resnet_group('group2', c3, resnet_bottleneck, 256, num_blocks[2], 2)
    # 16x downsampling up to now
    return c4


 @auto_reuse_variable_scope
 def resnet_conv5(image, num_block):
-    with resnet_argscope():
+    with resnet_argscope(), maybe_syncbn_scope():
        l = resnet_group('group3', image, resnet_bottleneck, 512, num_block, 2)
        return l

@@ -140,9 +150,10 @@ def resnet_fpn_backbone(image, num_blocks, freeze_c2=True):
        c2 = resnet_group('group0', l, resnet_bottleneck, 64, num_blocks[0], 1)
        if freeze_c2:
            c2 = tf.stop_gradient(c2)
-        c3 = resnet_group('group1', c2, resnet_bottleneck, 128, num_blocks[1], 2)
-        c4 = resnet_group('group2', c3, resnet_bottleneck, 256, num_blocks[2], 2)
-        c5 = resnet_group('group3', c4, resnet_bottleneck, 512, num_blocks[3], 2)
+        with maybe_syncbn_scope():
+            c3 = resnet_group('group1', c2, resnet_bottleneck, 128, num_blocks[1], 2)
+            c4 = resnet_group('group2', c3, resnet_bottleneck, 256, num_blocks[2], 2)
+            c5 = resnet_group('group3', c4, resnet_bottleneck, 512, num_blocks[3], 2)
    # 32x downsampling up to now
    # size of c5: ceil(input/32)
    return c2, c3, c4, c5
--- a/examples/FasterRCNN/config.py
+++ b/examples/FasterRCNN/config.py
@@ -60,9 +60,11 @@ _C.DATA.NUM_CATEGORY = 80    # 80 categories
 _C.DATA.CLASS_NAMES = []  # NUM_CLASS strings. Needs to be populated later by data loader

 # basemodel ----------------------
+_C.BACKBONE.WEIGHTS = '/path/to/ImageNet-ResNet50.npz'
 _C.BACKBONE.RESNET_NUM_BLOCK = [3, 4, 6, 3]     # for resnet50
 # RESNET_NUM_BLOCK = [3, 4, 23, 3]    # for resnet101
 _C.BACKBONE.FREEZE_AFFINE = False   # do not train affine parameters inside BN
+_C.BACKBONE.NORM = 'FreezeBN'  # options: FreezeBN, SyncBN

 # Use a base model with TF-preferred padding mode,
 # which may pad more pixels on right/bottom than top/left.
@@ -146,6 +148,11 @@ def finalize_configs(is_training):
    Run some sanity checks, and populate some configs from others
    """
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
+
+    assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN'], _C.BACKBONE.NORM
+    if _C.BACKBONE.NORM != 'FreezeBN':
+        assert not _C.BACKBONE.FREEZE_AFFINE
+
    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number

--- a/examples/FasterRCNN/train.py
+++ b/examples/FasterRCNN/train.py
@@ -532,15 +532,15 @@ class EvalCallback(Callback):

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument('--load', help='load a model for evaluation or training')
+    parser.add_argument('--load', help='load a model for evaluation or training. Can overwrite BACKBONE.WEIGHTS')
    parser.add_argument('--logdir', help='log directory', default='train_log/maskrcnn')
-    parser.add_argument('--config', help="A list of KEY=VALUE to overwrite those defined in config.py",
-                        nargs='+')
    parser.add_argument('--visualize', action='store_true', help='visualize intermediate results')
    parser.add_argument('--evaluate', help="Run evaluation on COCO. "
                                           "This argument is the path to the output json evaluation file")
    parser.add_argument('--predict', help="Run prediction on a given image. "
                                          "This argument is the path to the input image file")
+    parser.add_argument('--config', help="A list of KEY=VALUE to overwrite those defined in config.py",
+                        nargs='+')

    if get_tf_version_number() < 1.6:
        # https://github.com/tensorflow/tensorflow/issues/14657
@@ -613,13 +613,18 @@ if __name__ == '__main__':
        if not is_horovod:
            callbacks.append(GPUUtilizationTracker())

+        if args.load:
+            session_init = get_model_loader(args.load)
+        else:
+            session_init = get_model_loader(cfg.BACKBONE.WEIGHTS) if cfg.BACKBONE.WEIGHTS else None
+
        traincfg = TrainConfig(
            model=MODEL,
            data=QueueInput(get_train_dataflow()),
            callbacks=callbacks,
            steps_per_epoch=stepnum,
            max_epoch=cfg.TRAIN.LR_SCHEDULE[-1] * factor // stepnum,
-            session_init=get_model_loader(args.load) if args.load else None,
+            session_init=session_init,
        )
        if is_horovod:
            # horovod mode has the best speed for this model

--- a/tensorpack/train/trainers.py
+++ b/tensorpack/train/trainers.py
@@ -304,7 +304,10 @@ class HorovodTrainer(SingleCostTrainer):
        # There are other MPI options that can potentially improve performance, especially on special hardware.

    Note:
-        1. Due to a TF bug, you must not initialize CUDA context before training.
+        1. There are several options in Horovod installation and in MPI command line that can improve speed.
+           See Horovod docs for details.
+
+        2. Due to a TF bug, you must not initialize CUDA context before training.
           Therefore TF functions like `is_gpu_available()` or `list_local_devices()`
           must be avoided.

@@ -312,7 +315,9 @@ class HorovodTrainer(SingleCostTrainer):

        3. MPI sometimes fails to kill all processes. Be sure to check it afterwards.

-        4. Keep in mind that there is one process per GPU, therefore:
+        4. Keep in mind that there is one process running the script per GPU, therefore:
+
+           + Make sure your InputSource has reasonable randomness.

           + If your data processing is heavy, doing it in a separate dedicated process might be
             a better choice than doing them repeatedly in each process.