Commit 69465fc7 authored by Yuxin Wu's avatar Yuxin Wu

[MaskRcnn] clean-up #GPU logic

parent 12223dfb
...@@ -5,6 +5,7 @@ import numpy as np ...@@ -5,6 +5,7 @@ import numpy as np
# mode flags --------------------- # mode flags ---------------------
TRAINER = 'replicated' # options: 'horovod', 'replicated' TRAINER = 'replicated' # options: 'horovod', 'replicated'
NUM_GPUS = None # by default, will be set from code
MODE_MASK = True MODE_MASK = True
MODE_FPN = False MODE_FPN = False
......
...@@ -356,9 +356,7 @@ def get_train_dataflow(): ...@@ -356,9 +356,7 @@ def get_train_dataflow():
if config.TRAINER == 'horovod': if config.TRAINER == 'horovod':
ds = MultiThreadMapData(ds, 5, preprocess) ds = MultiThreadMapData(ds, 5, preprocess)
# MPI does not like fork(), but we use it for speed anyway. # MPI does not like fork()
# We only fork once here, which seems to work fine.
ds = PrefetchDataZMQ(ds, 1)
else: else:
ds = MultiProcessMapDataZMQ(ds, 10, preprocess) ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
return ds return ds
...@@ -374,6 +372,7 @@ def get_eval_dataflow(): ...@@ -374,6 +372,7 @@ def get_eval_dataflow():
assert im is not None, fname assert im is not None, fname
return im return im
ds = MapDataComponent(ds, f, 0) ds = MapDataComponent(ds, f, 0)
if config.TRAINER != 'horovod':
ds = PrefetchDataZMQ(ds, 1) ds = PrefetchDataZMQ(ds, 1)
return ds return ds
......
...@@ -12,6 +12,10 @@ import numpy as np ...@@ -12,6 +12,10 @@ import numpy as np
import json import json
import six import six
import tensorflow as tf import tensorflow as tf
try:
import horovod.tensorflow as hvd
except ImportError:
pass
assert six.PY3, "FasterRCNN requires Python 3!" assert six.PY3, "FasterRCNN requires Python 3!"
...@@ -21,7 +25,7 @@ from tensorpack.tfutils.scope_utils import under_name_scope ...@@ -21,7 +25,7 @@ from tensorpack.tfutils.scope_utils import under_name_scope
from tensorpack.tfutils import optimizer from tensorpack.tfutils import optimizer
from tensorpack.tfutils.common import get_tf_version_number from tensorpack.tfutils.common import get_tf_version_number
import tensorpack.utils.viz as tpviz import tensorpack.utils.viz as tpviz
from tensorpack.utils.gpu import get_nr_gpu from tensorpack.utils.gpu import get_num_gpu
from coco import COCODetection from coco import COCODetection
...@@ -47,12 +51,6 @@ from eval import ( ...@@ -47,12 +51,6 @@ from eval import (
import config import config
def get_batch_factor():
nr_gpu = get_nr_gpu()
assert nr_gpu in [1, 2, 4, 8], nr_gpu
return 8 // nr_gpu
def get_model_output_names(): def get_model_output_names():
ret = ['final_boxes', 'final_probs', 'final_labels'] ret = ['final_boxes', 'final_probs', 'final_labels']
if config.MODE_MASK: if config.MODE_MASK:
...@@ -96,13 +94,12 @@ class DetectionModel(ModelDesc): ...@@ -96,13 +94,12 @@ class DetectionModel(ModelDesc):
lr = tf.get_variable('learning_rate', initializer=0.003, trainable=False) lr = tf.get_variable('learning_rate', initializer=0.003, trainable=False)
tf.summary.scalar('learning_rate-summary', lr) tf.summary.scalar('learning_rate-summary', lr)
factor = get_batch_factor() factor = config.NUM_GPUS / 8.
if factor != 1: if factor != 1:
lr = lr / float(factor) lr = lr * factor
opt = tf.train.MomentumOptimizer(lr, 0.9)
opt = optimizer.AccumGradOptimizer(opt, factor)
else:
opt = tf.train.MomentumOptimizer(lr, 0.9) opt = tf.train.MomentumOptimizer(lr, 0.9)
if config.NUM_GPUS < 8:
opt = optimizer.AccumGradOptimizer(opt, 8 // config.NUM_GPUS)
return opt return opt
def fastrcnn_training(self, image, def fastrcnn_training(self, image,
...@@ -522,6 +519,7 @@ class EvalCallback(Callback): ...@@ -522,6 +519,7 @@ class EvalCallback(Callback):
interval = self.trainer.max_epoch // (EVAL_TIMES + 1) interval = self.trainer.max_epoch // (EVAL_TIMES + 1)
self.epochs_to_eval = set([interval * k for k in range(1, EVAL_TIMES + 1)]) self.epochs_to_eval = set([interval * k for k in range(1, EVAL_TIMES + 1)])
self.epochs_to_eval.add(self.trainer.max_epoch) self.epochs_to_eval.add(self.trainer.max_epoch)
logger.info("[EvalCallback] Will evaluate at epoch " + str(sorted(self.epochs_to_eval)))
def _eval(self): def _eval(self):
all_results = eval_coco(self.df, lambda img: detect_one_image(img, self.pred)) all_results = eval_coco(self.df, lambda img: detect_one_image(img, self.pred))
...@@ -542,10 +540,25 @@ class EvalCallback(Callback): ...@@ -542,10 +540,25 @@ class EvalCallback(Callback):
self._eval() self._eval()
def init_config():
if config.TRAINER == 'horovod':
ngpu = hvd.size()
else:
ngpu = get_num_gpu()
assert ngpu % 8 == 0 or 8 % ngpu == 0, ngpu
if config.NUM_GPUS is None:
config.NUM_GPUS = ngpu
else:
if config.TRAINER == 'horovod':
assert config.NUM_GPUS == ngpu
else:
assert config.NUM_GPUS <= ngpu
print_config()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--gpu', help='comma separated list of GPU(s) to use. Default to all available ones') parser.add_argument('--load', help='load a model for evaluation or training')
parser.add_argument('--load', help='load model for evaluation or training')
parser.add_argument('--logdir', help='log directory', default='train_log/maskrcnn') parser.add_argument('--logdir', help='log directory', default='train_log/maskrcnn')
parser.add_argument('--datadir', help='override config.BASEDIR') parser.add_argument('--datadir', help='override config.BASEDIR')
parser.add_argument('--visualize', action='store_true', help='visualize intermediate results') parser.add_argument('--visualize', action='store_true', help='visualize intermediate results')
...@@ -557,9 +570,6 @@ if __name__ == '__main__': ...@@ -557,9 +570,6 @@ if __name__ == '__main__':
if args.datadir: if args.datadir:
config.BASEDIR = args.datadir config.BASEDIR = args.datadir
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
if args.visualize or args.evaluate or args.predict: if args.visualize or args.evaluate or args.predict:
# autotune is too slow for inference # autotune is too slow for inference
os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0' os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
...@@ -589,14 +599,16 @@ if __name__ == '__main__': ...@@ -589,14 +599,16 @@ if __name__ == '__main__':
os.environ['TF_AUTOTUNE_THRESHOLD'] = '1' os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
is_horovod = config.TRAINER == 'horovod' is_horovod = config.TRAINER == 'horovod'
if is_horovod: if is_horovod:
import horovod.tensorflow as hvd
hvd.init() hvd.init()
logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size())) logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size()))
else:
assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
if not is_horovod or hvd.rank() == 0: if not is_horovod or hvd.rank() == 0:
logger.set_logger_dir(args.logdir, 'd') logger.set_logger_dir(args.logdir, 'd')
print_config()
factor = get_batch_factor() init_config()
factor = 8. / config.NUM_GPUS
stepnum = config.STEPS_PER_EPOCH stepnum = config.STEPS_PER_EPOCH
# warmup is step based, lr is epoch based # warmup is step based, lr is epoch based
...@@ -607,6 +619,8 @@ if __name__ == '__main__': ...@@ -607,6 +619,8 @@ if __name__ == '__main__':
mult = 0.1 ** (idx + 1) mult = 0.1 ** (idx + 1)
lr_schedule.append( lr_schedule.append(
(steps * factor // stepnum, config.BASE_LR * mult)) (steps * factor // stepnum, config.BASE_LR * mult))
logger.info("Warmup Up Schedule: " + str(warmup_schedule))
logger.info("LR Schedule: " + str(lr_schedule))
callbacks = [ callbacks = [
PeriodicCallback( PeriodicCallback(
...@@ -619,12 +633,10 @@ if __name__ == '__main__': ...@@ -619,12 +633,10 @@ if __name__ == '__main__':
EvalCallback(), EvalCallback(),
PeakMemoryTracker(), PeakMemoryTracker(),
EstimatedTimeLeft(), EstimatedTimeLeft(),
SessionRunTimeout(60000).set_chief_only(True), # 1 minute timeout
] ]
if not is_horovod: if not is_horovod:
callbacks.extend([ callbacks.append(GPUUtilizationTracker())
GPUUtilizationTracker(),
SessionRunTimeout(60000), # 1 minute timeout
])
cfg = TrainConfig( cfg = TrainConfig(
model=get_model(), model=get_model(),
...@@ -634,9 +646,10 @@ if __name__ == '__main__': ...@@ -634,9 +646,10 @@ if __name__ == '__main__':
max_epoch=config.LR_SCHEDULE[-1] * factor // stepnum, max_epoch=config.LR_SCHEDULE[-1] * factor // stepnum,
session_init=get_model_loader(args.load) if args.load else None, session_init=get_model_loader(args.load) if args.load else None,
) )
# nccl mode gives the best speed
if is_horovod: if is_horovod:
# horovod mode has the best speed for this model
trainer = HorovodTrainer() trainer = HorovodTrainer()
else: else:
trainer = SyncMultiGPUTrainerReplicated(get_nr_gpu(), mode='nccl') # nccl mode has better speed than cpu mode
trainer = SyncMultiGPUTrainerReplicated(config.NUM_GPUS, mode='nccl')
launch_train_with_config(cfg, trainer) launch_train_with_config(cfg, trainer)
...@@ -26,7 +26,7 @@ class GPUUtilizationTracker(Callback): ...@@ -26,7 +26,7 @@ class GPUUtilizationTracker(Callback):
within the epoch (the trigger_epoch time was not included), within the epoch (the trigger_epoch time was not included),
and write average utilization to monitors. and write average utilization to monitors.
This callback creates a process, therefore it cannot be used with MPI. This callback creates a process, therefore it's not safe to be used with MPI.
""" """
_chief_only = False _chief_only = False
...@@ -53,7 +53,6 @@ class GPUUtilizationTracker(Callback): ...@@ -53,7 +53,6 @@ class GPUUtilizationTracker(Callback):
assert len(self._devices), "[GPUUtilizationTracker] No GPU device given!" assert len(self._devices), "[GPUUtilizationTracker] No GPU device given!"
def _before_train(self): def _before_train(self):
assert 'OMPI_COMM_WORLD_SIZE' not in os.environ, "GPUUtilizationTracker cannot be used under MPI!"
self._evt = mp.Event() self._evt = mp.Event()
self._stop_evt = mp.Event() self._stop_evt = mp.Event()
self._queue = mp.Queue() self._queue = mp.Queue()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment