Commit c4a68c6c authored by Yuxin Wu

[MaskRCNN] misc small updates

parent 847fae12
...@@ -24,7 +24,8 @@ class AttrDict(): ...@@ -24,7 +24,8 @@ class AttrDict():
def __setattr__(self, name, value): def __setattr__(self, name, value):
if self._freezed and name not in self.__dict__: if self._freezed and name not in self.__dict__:
raise AttributeError("Cannot create new attribute!") raise AttributeError(
"Config was freezed! Unknown config: {}".format(name))
super().__setattr__(name, value) super().__setattr__(name, value)
def __str__(self): def __str__(self):
...@@ -54,11 +55,11 @@ class AttrDict(): ...@@ -54,11 +55,11 @@ class AttrDict():
v = eval(v) v = eval(v)
setattr(dic, key, v) setattr(dic, key, v)
def freeze(self): def freeze(self, freezed=True):
self._freezed = True self._freezed = freezed
for v in self.__dict__.values(): for v in self.__dict__.values():
if isinstance(v, AttrDict): if isinstance(v, AttrDict):
v.freeze() v.freeze(freezed)
# avoid silent bugs # avoid silent bugs
def __eq__(self, _): def __eq__(self, _):
...@@ -95,7 +96,6 @@ _C.BACKBONE.FREEZE_AT = 2 # options: 0, 1, 2 ...@@ -95,7 +96,6 @@ _C.BACKBONE.FREEZE_AT = 2 # options: 0, 1, 2
# Use a base model with TF-preferred padding mode, # Use a base model with TF-preferred padding mode,
# which may pad more pixels on right/bottom than top/left. # which may pad more pixels on right/bottom than top/left.
# See https://github.com/tensorflow/tensorflow/issues/18213 # See https://github.com/tensorflow/tensorflow/issues/18213
# In tensorpack model zoo, ResNet models with TF_PAD_MODE=False are marked with "-AlignPadding". # In tensorpack model zoo, ResNet models with TF_PAD_MODE=False are marked with "-AlignPadding".
# All other models under `ResNet/` in the model zoo are using TF_PAD_MODE=True. # All other models under `ResNet/` in the model zoo are using TF_PAD_MODE=True.
# Using either one should probably give the same performance. # Using either one should probably give the same performance.
...@@ -110,11 +110,16 @@ _C.TRAIN.BASE_LR = 1e-2 # defined for a total batch size of 8. Otherwise it wil ...@@ -110,11 +110,16 @@ _C.TRAIN.BASE_LR = 1e-2 # defined for a total batch size of 8. Otherwise it wil
_C.TRAIN.WARMUP = 1000 # in terms of iterations. This is not affected by #GPUs _C.TRAIN.WARMUP = 1000 # in terms of iterations. This is not affected by #GPUs
_C.TRAIN.STEPS_PER_EPOCH = 500 _C.TRAIN.STEPS_PER_EPOCH = 500
# LR_SCHEDULE means "steps" only when total batch size is 8. # LR_SCHEDULE means equivalent steps when the total batch size is 8.
# Otherwise the actual steps to decrease learning rate are computed from the schedule. # When the total bs!=8, the actual iterations to decrease learning rate, and
# the base learning rate are computed from BASE_LR and LR_SCHEDULE.
# Therefore, there is *no need* to modify the config if you only change the number of GPUs. # Therefore, there is *no need* to modify the config if you only change the number of GPUs.
# LR_SCHEDULE = [120000, 160000, 180000] # "1x" schedule in detectron
_C.TRAIN.LR_SCHEDULE = [240000, 320000, 360000] # "2x" schedule in detectron # _C.TRAIN.LR_SCHEDULE = [120000, 160000, 180000] # "1x" schedule in detectron
_C.TRAIN.LR_SCHEDULE = [240000, 320000, 360000] # "2x" schedule in detectron
# Longer schedules for from-scratch training (https://arxiv.org/abs/1811.08883):
# _C.TRAIN.LR_SCHEDULE = [960000, 1040000, 1080000] # "6x" schedule in detectron
# _C.TRAIN.LR_SCHEDULE = [1500000, 1580000, 1620000] # "9x" schedule in detectron
_C.TRAIN.EVAL_PERIOD = 25 # period (epochs) to run evaluation _C.TRAIN.EVAL_PERIOD = 25 # period (epochs) to run evaluation
# preprocessing -------------------- # preprocessing --------------------
...@@ -167,8 +172,7 @@ _C.FPN.ANCHOR_STRIDES = (4, 8, 16, 32, 64) # strides for each FPN level. Must b ...@@ -167,8 +172,7 @@ _C.FPN.ANCHOR_STRIDES = (4, 8, 16, 32, 64) # strides for each FPN level. Must b
_C.FPN.PROPOSAL_MODE = 'Level' # 'Level', 'Joint' _C.FPN.PROPOSAL_MODE = 'Level' # 'Level', 'Joint'
_C.FPN.NUM_CHANNEL = 256 _C.FPN.NUM_CHANNEL = 256
_C.FPN.NORM = 'None' # 'None', 'GN' _C.FPN.NORM = 'None' # 'None', 'GN'
# conv head and fc head are only used in FPN. # The head option is only used in FPN. For C4 models, the head is C5
# For C4 models, the head is C5
_C.FPN.FRCNN_HEAD_FUNC = 'fastrcnn_2fc_head' _C.FPN.FRCNN_HEAD_FUNC = 'fastrcnn_2fc_head'
# choices: fastrcnn_2fc_head, fastrcnn_4conv1fc_{,gn_}head # choices: fastrcnn_2fc_head, fastrcnn_4conv1fc_{,gn_}head
_C.FPN.FRCNN_CONV_HEAD_DIM = 256 _C.FPN.FRCNN_CONV_HEAD_DIM = 256
...@@ -192,11 +196,14 @@ _C.TEST.RESULT_SCORE_THRESH = 0.05 ...@@ -192,11 +196,14 @@ _C.TEST.RESULT_SCORE_THRESH = 0.05
_C.TEST.RESULT_SCORE_THRESH_VIS = 0.3 # only visualize confident results _C.TEST.RESULT_SCORE_THRESH_VIS = 0.3 # only visualize confident results
_C.TEST.RESULTS_PER_IM = 100 _C.TEST.RESULTS_PER_IM = 100
_C.freeze() # avoid typo / wrong config keys
def finalize_configs(is_training): def finalize_configs(is_training):
""" """
Run some sanity checks, and populate some configs from others Run some sanity checks, and populate some configs from others
""" """
_C.freeze(False) # populate new keys now
_C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1 # +1 background _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1 # +1 background
_C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR) _C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR)
......
...@@ -232,18 +232,9 @@ class ResNetFPNModel(DetectionModel): ...@@ -232,18 +232,9 @@ class ResNetFPNModel(DetectionModel):
) # NR_GT x height x width ) # NR_GT x height x width
return ret return ret
def slice_feature_and_anchors(self, image_shape2d, p23456, anchors): def slice_feature_and_anchors(self, p23456, anchors):
for i, stride in enumerate(cfg.FPN.ANCHOR_STRIDES): for i, stride in enumerate(cfg.FPN.ANCHOR_STRIDES):
with tf.name_scope('FPN_slice_lvl{}'.format(i)): with tf.name_scope('FPN_slice_lvl{}'.format(i)):
if i < 3:
# Images are padded for p5, which are too large for p2-p4.
# This seems to have no effect on mAP.
pi = p23456[i]
target_shape = tf.to_int32(tf.ceil(tf.to_float(image_shape2d) * (1.0 / stride)))
p23456[i] = tf.slice(pi, [0, 0, 0, 0],
tf.concat([[-1, -1], target_shape], axis=0))
p23456[i].set_shape([1, pi.shape[1], None, None])
anchors[i] = anchors[i].narrow_to(p23456[i]) anchors[i] = anchors[i].narrow_to(p23456[i])
def backbone(self, image): def backbone(self, image):
...@@ -260,7 +251,7 @@ class ResNetFPNModel(DetectionModel): ...@@ -260,7 +251,7 @@ class ResNetFPNModel(DetectionModel):
all_anchors_fpn[i], all_anchors_fpn[i],
inputs['anchor_labels_lvl{}'.format(i + 2)], inputs['anchor_labels_lvl{}'.format(i + 2)],
inputs['anchor_boxes_lvl{}'.format(i + 2)]) for i in range(len(all_anchors_fpn))] inputs['anchor_boxes_lvl{}'.format(i + 2)]) for i in range(len(all_anchors_fpn))]
self.slice_feature_and_anchors(image_shape2d, features, multilevel_anchors) self.slice_feature_and_anchors(features, multilevel_anchors)
# Multi-Level RPN Proposals # Multi-Level RPN Proposals
rpn_outputs = [rpn_head('rpn', pi, cfg.FPN.NUM_CHANNEL, len(cfg.RPN.ANCHOR_RATIOS)) rpn_outputs = [rpn_head('rpn', pi, cfg.FPN.NUM_CHANNEL, len(cfg.RPN.ANCHOR_RATIOS))
...@@ -472,23 +463,24 @@ class EvalCallback(Callback): ...@@ -472,23 +463,24 @@ class EvalCallback(Callback):
futures.append(executor.submit(eval_coco, dataflow, pred, pbar)) futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
all_results = list(itertools.chain(*[fut.result() for fut in futures])) all_results = list(itertools.chain(*[fut.result() for fut in futures]))
else: else:
filenames = [os.path.join(
logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
) for rank in range(hvd.local_size())]
if self._horovod_run_eval: if self._horovod_run_eval:
local_results = eval_coco(self.dataflow, self.predictor) local_results = eval_coco(self.dataflow, self.predictor)
output_partial = os.path.join( fname = filenames[hvd.local_rank()]
logdir, 'outputs{}-part{}.json'.format(self.global_step, hvd.local_rank())) with open(fname, 'w') as f:
with open(output_partial, 'w') as f:
json.dump(local_results, f) json.dump(local_results, f)
self.barrier.eval() self.barrier.eval()
if hvd.rank() > 0: if hvd.rank() > 0:
return return
all_results = [] all_results = []
for k in range(hvd.local_size()): for fname in filenames:
output_partial = os.path.join( with open(fname, 'r') as f:
logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
with open(output_partial, 'r') as f:
obj = json.load(f) obj = json.load(f)
all_results.extend(obj) all_results.extend(obj)
os.unlink(output_partial) os.unlink(fname)
output_file = os.path.join( output_file = os.path.join(
logdir, 'outputs{}.json'.format(self.global_step)) logdir, 'outputs{}.json'.format(self.global_step))
...@@ -615,6 +607,6 @@ if __name__ == '__main__': ...@@ -615,6 +607,6 @@ if __name__ == '__main__':
if is_horovod: if is_horovod:
trainer = HorovodTrainer(average=False) trainer = HorovodTrainer(average=False)
else: else:
# nccl mode has better speed than cpu mode # nccl mode appears faster than cpu mode
trainer = SyncMultiGPUTrainerReplicated(cfg.TRAIN.NUM_GPUS, average=False, mode='nccl') trainer = SyncMultiGPUTrainerReplicated(cfg.TRAIN.NUM_GPUS, average=False, mode='nccl')
launch_train_with_config(traincfg, trainer) launch_train_with_config(traincfg, trainer)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment