[MaskRCNN] misc small updates

c4a68c6c · Yuxin Wu · 847fae12 · c4a68c6c · c4a68c6c
Commit c4a68c6c authored Nov 21, 2018 by Yuxin Wu
Hide whitespace changes
Inline Side-by-side

Showing with 30 additions and 31 deletions

examples/FasterRCNN/config.py examples/FasterRCNN/config.py +18 -11

examples/FasterRCNN/train.py examples/FasterRCNN/train.py +12 -20

No files found.
--- a/examples/FasterRCNN/config.py
+++ b/examples/FasterRCNN/config.py
@@ -24,7 +24,8 @@ class AttrDict():

    def __setattr__(self, name, value):
        if self._freezed and name not in self.__dict__:
-            raise AttributeError("Cannot create new attribute!")
+            raise AttributeError(
+                "Config was freezed! Unknown config: {}".format(name))
        super().__setattr__(name, value)

    def __str__(self):
@@ -54,11 +55,11 @@ class AttrDict():
                v = eval(v)
            setattr(dic, key, v)

-    def freeze(self):
-        self._freezed = True
+    def freeze(self, freezed=True):
+        self._freezed = freezed
        for v in self.__dict__.values():
            if isinstance(v, AttrDict):
-                v.freeze()
+                v.freeze(freezed)

    # avoid silent bugs
    def __eq__(self, _):
@@ -95,7 +96,6 @@ _C.BACKBONE.FREEZE_AT = 2  # options: 0, 1, 2
 # Use a base model with TF-preferred padding mode,
 # which may pad more pixels on right/bottom than top/left.
 # See https://github.com/tensorflow/tensorflow/issues/18213
-
 # In tensorpack model zoo, ResNet models with TF_PAD_MODE=False are marked with "-AlignPadding".
 # All other models under `ResNet/` in the model zoo are using TF_PAD_MODE=True.
 # Using either one should probably give the same performance.
@@ -110,11 +110,16 @@ _C.TRAIN.BASE_LR = 1e-2  # defined for a total batch size of 8. Otherwise it wil
 _C.TRAIN.WARMUP = 1000   # in terms of iterations. This is not affected by #GPUs
 _C.TRAIN.STEPS_PER_EPOCH = 500

-# LR_SCHEDULE means "steps" only when total batch size is 8.
-# Otherwise the actual steps to decrease learning rate are computed from the schedule.
+# LR_SCHEDULE is expressed in iterations for a total batch size of 8.
+# When the total batch size is not 8, the actual iterations at which the learning rate
+# is decreased, and the actual base learning rate, are computed from BASE_LR and LR_SCHEDULE.
 # Therefore, there is *no need* to modify the config if you only change the number of GPUs.
-# LR_SCHEDULE = [120000, 160000, 180000]  # "1x" schedule in detectron
-_C.TRAIN.LR_SCHEDULE = [240000, 320000, 360000]    # "2x" schedule in detectron
+
+# _C.TRAIN.LR_SCHEDULE = [120000, 160000, 180000]      # "1x" schedule in detectron
+_C.TRAIN.LR_SCHEDULE = [240000, 320000, 360000]      # "2x" schedule in detectron
+# Longer schedules for from-scratch training (https://arxiv.org/abs/1811.08883):
+# _C.TRAIN.LR_SCHEDULE = [960000, 1040000, 1080000]    # "6x" schedule in detectron
+# _C.TRAIN.LR_SCHEDULE = [1500000, 1580000, 1620000]   # "9x" schedule in detectron
 _C.TRAIN.EVAL_PERIOD = 25  # period (epochs) to run evaluation

 # preprocessing --------------------
@@ -167,8 +172,7 @@ _C.FPN.ANCHOR_STRIDES = (4, 8, 16, 32, 64)  # strides for each FPN level. Must b
 _C.FPN.PROPOSAL_MODE = 'Level'  # 'Level', 'Joint'
 _C.FPN.NUM_CHANNEL = 256
 _C.FPN.NORM = 'None'  # 'None', 'GN'
-# conv head and fc head are only used in FPN.
-# For C4 models, the head is C5
+# The head option is only used in FPN. For C4 models, the head is C5
 _C.FPN.FRCNN_HEAD_FUNC = 'fastrcnn_2fc_head'
 # choices: fastrcnn_2fc_head, fastrcnn_4conv1fc_{,gn_}head
 _C.FPN.FRCNN_CONV_HEAD_DIM = 256
@@ -192,11 +196,14 @@ _C.TEST.RESULT_SCORE_THRESH = 0.05
 _C.TEST.RESULT_SCORE_THRESH_VIS = 0.3   # only visualize confident results
 _C.TEST.RESULTS_PER_IM = 100

+_C.freeze()  # avoid typo / wrong config keys
+

 def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
+    _C.freeze(False)  # populate new keys now
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
    _C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR)


--- a/examples/FasterRCNN/train.py
+++ b/examples/FasterRCNN/train.py
@@ -232,18 +232,9 @@ class ResNetFPNModel(DetectionModel):
            )   # NR_GT x height x width
        return ret

-    def slice_feature_and_anchors(self, image_shape2d, p23456, anchors):
+    def slice_feature_and_anchors(self, p23456, anchors):
        for i, stride in enumerate(cfg.FPN.ANCHOR_STRIDES):
            with tf.name_scope('FPN_slice_lvl{}'.format(i)):
-                if i < 3:
-                    # Images are padded for p5, which are too large for p2-p4.
-                    # This seems to have no effect on mAP.
-                    pi = p23456[i]
-                    target_shape = tf.to_int32(tf.ceil(tf.to_float(image_shape2d) * (1.0 / stride)))
-                    p23456[i] = tf.slice(pi, [0, 0, 0, 0],
-                                         tf.concat([[-1, -1], target_shape], axis=0))
-                    p23456[i].set_shape([1, pi.shape[1], None, None])
-
                anchors[i] = anchors[i].narrow_to(p23456[i])

    def backbone(self, image):
@@ -260,7 +251,7 @@ class ResNetFPNModel(DetectionModel):
            all_anchors_fpn[i],
            inputs['anchor_labels_lvl{}'.format(i + 2)],
            inputs['anchor_boxes_lvl{}'.format(i + 2)]) for i in range(len(all_anchors_fpn))]
-        self.slice_feature_and_anchors(image_shape2d, features, multilevel_anchors)
+        self.slice_feature_and_anchors(features, multilevel_anchors)

        # Multi-Level RPN Proposals
        rpn_outputs = [rpn_head('rpn', pi, cfg.FPN.NUM_CHANNEL, len(cfg.RPN.ANCHOR_RATIOS))
@@ -472,23 +463,24 @@ class EvalCallback(Callback):
                    futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
                all_results = list(itertools.chain(*[fut.result() for fut in futures]))
        else:
+            filenames = [os.path.join(
+                logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
+            ) for rank in range(hvd.local_size())]
+
            if self._horovod_run_eval:
                local_results = eval_coco(self.dataflow, self.predictor)
-                output_partial = os.path.join(
-                    logdir, 'outputs{}-part{}.json'.format(self.global_step, hvd.local_rank()))
-                with open(output_partial, 'w') as f:
+                fname = filenames[hvd.local_rank()]
+                with open(fname, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
-            for k in range(hvd.local_size()):
-                output_partial = os.path.join(
-                    logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
-                with open(output_partial, 'r') as f:
+            for fname in filenames:
+                with open(fname, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
-                os.unlink(output_partial)
+                os.unlink(fname)

        output_file = os.path.join(
            logdir, 'outputs{}.json'.format(self.global_step))
@@ -615,6 +607,6 @@ if __name__ == '__main__':
        if is_horovod:
            trainer = HorovodTrainer(average=False)
        else:
-            # nccl mode has better speed than cpu mode
+            # nccl mode appears faster than cpu mode
            trainer = SyncMultiGPUTrainerReplicated(cfg.TRAIN.NUM_GPUS, average=False, mode='nccl')
        launch_train_with_config(traincfg, trainer)