Commit fe5d4984 authored by Yuxin Wu's avatar Yuxin Wu

average option in replicated trainers

parent 70c9ba8f
......@@ -179,7 +179,7 @@ class ImageNetModel(ModelDesc):
def _get_optimizer(self):
lr = tf.get_variable('learning_rate', initializer=0.1, trainable=False)
tf.summary.scalar('learning_rate', lr)
tf.summary.scalar('learning_rate-summary', lr)
return tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True)
@staticmethod
......
......@@ -175,12 +175,16 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
Data-parallel training in "replicated" mode,
where each GPU contains a replicate of the whole model.
It will build one tower on each GPU under its own variable scope.
Each gradient update is averaged across or GPUs through NCCL.
Each gradient update is averaged or summed across all GPUs through NCCL.
It is an equivalent of ``--variable_update=replicated`` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
"""
def __init__(self, towers, average):
super(SyncMultiGPUReplicatedBuilder, self).__init__(towers)
self._average = average
def build(self, get_grad_fn, get_opt_fn):
"""
Args:
......@@ -207,7 +211,7 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
DataParallelBuilder._check_grad_list(grad_list)
if True:
grads = allreduce_grads(grad_list) # #gpu x #param x 2
grads = allreduce_grads(grad_list, average=self._average) # #gpu x #param x 2
else:
agg_grad_and_vars = average_grads(grad_list, colocation=False, devices=['/cpu:0']) # #param x 2
grads = [] # #gpu x #param x 2
......
......@@ -96,13 +96,14 @@ class LeastLoadedDeviceSetter(object):
return "LeastLoadedDeviceSetter-{}".format(self.worker_device)
def allreduce_grads(all_grads):
def allreduce_grads(all_grads, average):
"""
All-reduce the gradients among devices (averaging or summing them). Results are broadcasted to all devices.
Args:
all_grads (K x N x 2): A list of K lists. Each of the list is a list of N (grad, var) tuples.
The variables have to be the same across the K lists.
average (bool): average gradients or not.
Returns:
(K x N x 2): same as input, but each grad is replaced by the all-reduced (averaged or summed) result over the K lists.
......@@ -122,7 +123,8 @@ def allreduce_grads(all_grads):
for (_, v), g in zip(grad_and_vars, summed):
with tf.device(g.device):
# tensorflow/benchmarks didn't average gradients
g = tf.multiply(g, 1.0 / nr_tower)
if average:
g = tf.multiply(g, 1.0 / nr_tower)
grads_for_a_var.append((g, v))
new_all_grads.append(grads_for_a_var)
......
......@@ -138,13 +138,14 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
"""
@map_arg(gpus=_int_to_range)
def __init__(self, gpus):
def __init__(self, gpus, average=True):
"""
Args:
gpus ([int]): list of GPU ids.
gpus (int or [int]): list of GPU ids.
average (bool): whether to average or sum gradients.
"""
self.devices = gpus
self._builder = SyncMultiGPUReplicatedBuilder(gpus)
self._builder = SyncMultiGPUReplicatedBuilder(gpus, average)
super(SyncMultiGPUTrainerReplicated, self).__init__()
def _setup_graph(self, input, get_cost_fn, get_opt_fn):
......@@ -283,18 +284,21 @@ class HorovodTrainer(SingleCostTrainer):
(Add other environment variables you need by -x, e.g. PYTHONPATH, PATH)
Note:
1. Gradients are averaged among all processes.
2. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
1. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
3. Due to the use of MPI, training is less informative (no progress bar).
2. Due to the use of MPI, training is less informative (no progress bar).
4. MPI often fails to kill all processes. Be sure to check it.
3. MPI often fails to kill all processes. Be sure to check it.
"""
def __init__(self):
def __init__(self, average=True):
"""
Args:
average (bool): whether to average or sum the gradients across processes.
"""
hvd.init()
self.is_chief = hvd.rank() == 0
self._local_rank = hvd.local_rank()
self._average = average
logger.info("Horovod local rank={}".format(self._local_rank))
super(HorovodTrainer, self).__init__()
......@@ -306,7 +310,7 @@ class HorovodTrainer(SingleCostTrainer):
with tf.name_scope("HVDAllReduce"):
for grad, var in grads:
if grad is not None:
avg_grad = hvd.allreduce(grad, average=True)
avg_grad = hvd.allreduce(grad, average=self._average)
averaged_gradients.append((avg_grad, var))
else:
averaged_gradients.append((None, var))
......@@ -323,8 +327,7 @@ class HorovodTrainer(SingleCostTrainer):
op = hvd.broadcast_global_variables(0)
cb = RunOp(
op, run_before=True,
run_as_trigger=False, verbose=True)
cb.chief_only = False
run_as_trigger=False, verbose=True).set_chief_only(False)
return [cb]
@HIDE_DOC
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment