Commit 820bcac1 authored by Yuxin Wu

[MaskRCNN] support horovod

parent 456f5675
......@@ -4,6 +4,7 @@
import numpy as np
# mode flags ---------------------
TRAINER = 'replicated' # options: 'horovod', 'replicated'
MODE_MASK = True
MODE_FPN = False
......
......@@ -8,7 +8,8 @@ import itertools
from tensorpack.utils.argtools import memoized, log_once
from tensorpack.dataflow import (
imgaug, TestDataSpeed, PrefetchDataZMQ, MultiProcessMapDataZMQ,
imgaug, TestDataSpeed,
PrefetchDataZMQ, MultiProcessMapDataZMQ, MultiThreadMapData,
MapDataComponent, DataFromList)
from tensorpack.utils import logger
# import tensorpack.utils.viz as tpviz
......@@ -353,6 +354,12 @@ def get_train_dataflow():
# tpviz.interactive_imshow(viz)
return ret
if config.TRAINER == 'horovod':
ds = MultiThreadMapData(ds, 5, preprocess)
# MPI does not like fork(), but we use it for speed anyway.
# We only fork once here, which seems to work fine.
ds = PrefetchDataZMQ(ds, 1)
else:
ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
return ds
......
......@@ -94,7 +94,7 @@ class DetectionModel(ModelDesc):
def optimizer(self):
lr = tf.get_variable('learning_rate', initializer=0.003, trainable=False)
tf.summary.scalar('learning_rate', lr)
tf.summary.scalar('learning_rate-summary', lr)
factor = get_batch_factor()
if factor != 1:
......@@ -586,7 +586,15 @@ if __name__ == '__main__':
COCODetection(config.BASEDIR, 'val2014') # Only to load the class names into caches
predict(pred, args.predict)
else:
logger.set_logger_dir(args.logdir)
os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
is_horovod = config.TRAINER == 'horovod'
if is_horovod:
import horovod.tensorflow as hvd
hvd.init()
logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size()))
if not is_horovod or hvd.rank() == 0:
logger.set_logger_dir(args.logdir, 'd')
print_config()
factor = get_batch_factor()
stepnum = config.STEPS_PER_EPOCH
......@@ -600,10 +608,7 @@ if __name__ == '__main__':
lr_schedule.append(
(steps * factor // stepnum, config.BASE_LR * mult))
cfg = TrainConfig(
model=get_model(),
data=QueueInput(get_train_dataflow()),
callbacks=[
callbacks = [
PeriodicCallback(
ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
every_k_epochs=20),
......@@ -612,15 +617,26 @@ if __name__ == '__main__':
'learning_rate', warmup_schedule, interp='linear', step_based=True),
ScheduledHyperParamSetter('learning_rate', lr_schedule),
EvalCallback(),
GPUUtilizationTracker(),
PeakMemoryTracker(),
EstimatedTimeLeft(),
]
if not is_horovod:
callbacks.extend([
GPUUtilizationTracker(),
SessionRunTimeout(60000), # 1 minute timeout
],
])
cfg = TrainConfig(
model=get_model(),
data=QueueInput(get_train_dataflow()),
callbacks=callbacks,
steps_per_epoch=stepnum,
max_epoch=config.LR_SCHEDULE[-1] * factor // stepnum,
session_init=get_model_loader(args.load) if args.load else None,
)
# nccl mode gives the best speed
if is_horovod:
trainer = HorovodTrainer()
else:
trainer = SyncMultiGPUTrainerReplicated(get_nr_gpu(), mode='nccl')
launch_train_with_config(cfg, trainer)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment