Commit 820bcac1 authored by Yuxin Wu

[MaskRCNN] support horovod

parent 456f5675
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
import numpy as np import numpy as np
# mode flags --------------------- # mode flags ---------------------
TRAINER = 'replicated' # options: 'horovod', 'replicated'
MODE_MASK = True MODE_MASK = True
MODE_FPN = False MODE_FPN = False
......
...@@ -8,7 +8,8 @@ import itertools ...@@ -8,7 +8,8 @@ import itertools
from tensorpack.utils.argtools import memoized, log_once from tensorpack.utils.argtools import memoized, log_once
from tensorpack.dataflow import ( from tensorpack.dataflow import (
imgaug, TestDataSpeed, PrefetchDataZMQ, MultiProcessMapDataZMQ, imgaug, TestDataSpeed,
PrefetchDataZMQ, MultiProcessMapDataZMQ, MultiThreadMapData,
MapDataComponent, DataFromList) MapDataComponent, DataFromList)
from tensorpack.utils import logger from tensorpack.utils import logger
# import tensorpack.utils.viz as tpviz # import tensorpack.utils.viz as tpviz
...@@ -353,7 +354,13 @@ def get_train_dataflow(): ...@@ -353,7 +354,13 @@ def get_train_dataflow():
# tpviz.interactive_imshow(viz) # tpviz.interactive_imshow(viz)
return ret return ret
ds = MultiProcessMapDataZMQ(ds, 10, preprocess) if config.TRAINER == 'horovod':
ds = MultiThreadMapData(ds, 5, preprocess)
# MPI does not like fork(), but we use it for speed anyway.
# We only fork once here, which seems to work fine.
ds = PrefetchDataZMQ(ds, 1)
else:
ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
return ds return ds
......
...@@ -94,7 +94,7 @@ class DetectionModel(ModelDesc): ...@@ -94,7 +94,7 @@ class DetectionModel(ModelDesc):
def optimizer(self): def optimizer(self):
lr = tf.get_variable('learning_rate', initializer=0.003, trainable=False) lr = tf.get_variable('learning_rate', initializer=0.003, trainable=False)
tf.summary.scalar('learning_rate', lr) tf.summary.scalar('learning_rate-summary', lr)
factor = get_batch_factor() factor = get_batch_factor()
if factor != 1: if factor != 1:
...@@ -586,7 +586,15 @@ if __name__ == '__main__': ...@@ -586,7 +586,15 @@ if __name__ == '__main__':
COCODetection(config.BASEDIR, 'val2014') # Only to load the class names into caches COCODetection(config.BASEDIR, 'val2014') # Only to load the class names into caches
predict(pred, args.predict) predict(pred, args.predict)
else: else:
logger.set_logger_dir(args.logdir) os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
is_horovod = config.TRAINER == 'horovod'
if is_horovod:
import horovod.tensorflow as hvd
hvd.init()
logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size()))
if not is_horovod or hvd.rank() == 0:
logger.set_logger_dir(args.logdir, 'd')
print_config() print_config()
factor = get_batch_factor() factor = get_batch_factor()
stepnum = config.STEPS_PER_EPOCH stepnum = config.STEPS_PER_EPOCH
...@@ -600,27 +608,35 @@ if __name__ == '__main__': ...@@ -600,27 +608,35 @@ if __name__ == '__main__':
lr_schedule.append( lr_schedule.append(
(steps * factor // stepnum, config.BASE_LR * mult)) (steps * factor // stepnum, config.BASE_LR * mult))
callbacks = [
PeriodicCallback(
ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
every_k_epochs=20),
# linear warmup
ScheduledHyperParamSetter(
'learning_rate', warmup_schedule, interp='linear', step_based=True),
ScheduledHyperParamSetter('learning_rate', lr_schedule),
EvalCallback(),
PeakMemoryTracker(),
EstimatedTimeLeft(),
]
if not is_horovod:
callbacks.extend([
GPUUtilizationTracker(),
SessionRunTimeout(60000), # 1 minute timeout
])
cfg = TrainConfig( cfg = TrainConfig(
model=get_model(), model=get_model(),
data=QueueInput(get_train_dataflow()), data=QueueInput(get_train_dataflow()),
callbacks=[ callbacks=callbacks,
PeriodicCallback(
ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
every_k_epochs=20),
# linear warmup
ScheduledHyperParamSetter(
'learning_rate', warmup_schedule, interp='linear', step_based=True),
ScheduledHyperParamSetter('learning_rate', lr_schedule),
EvalCallback(),
GPUUtilizationTracker(),
PeakMemoryTracker(),
EstimatedTimeLeft(),
SessionRunTimeout(60000), # 1 minute timeout
],
steps_per_epoch=stepnum, steps_per_epoch=stepnum,
max_epoch=config.LR_SCHEDULE[-1] * factor // stepnum, max_epoch=config.LR_SCHEDULE[-1] * factor // stepnum,
session_init=get_model_loader(args.load) if args.load else None, session_init=get_model_loader(args.load) if args.load else None,
) )
# nccl mode gives the best speed # nccl mode gives the best speed
trainer = SyncMultiGPUTrainerReplicated(get_nr_gpu(), mode='nccl') if is_horovod:
trainer = HorovodTrainer()
else:
trainer = SyncMultiGPUTrainerReplicated(get_nr_gpu(), mode='nccl')
launch_train_with_config(cfg, trainer) launch_train_with_config(cfg, trainer)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment