Commit fe5d4984 authored by Yuxin Wu's avatar Yuxin Wu

average option in replicated trainers

parent 70c9ba8f
......@@ -179,7 +179,7 @@ class ImageNetModel(ModelDesc):
def _get_optimizer(self):
lr = tf.get_variable('learning_rate', initializer=0.1, trainable=False)
tf.summary.scalar('learning_rate', lr)
tf.summary.scalar('learning_rate-summary', lr)
return tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True)
@staticmethod
......
......@@ -175,12 +175,16 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
Data-parallel training in "replicated" mode,
where each GPU contains a replicate of the whole model.
It will build one tower on each GPU under its own variable scope.
Each gradient update is averaged across or GPUs through NCCL.
Each gradient update is averaged or summed across all GPUs through NCCL.
It is an equivalent of ``--variable_update=replicated`` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
"""
def __init__(self, towers, average):
super(SyncMultiGPUReplicatedBuilder, self).__init__(towers)
self._average = average
def build(self, get_grad_fn, get_opt_fn):
"""
Args:
......@@ -207,7 +211,7 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
DataParallelBuilder._check_grad_list(grad_list)
if True:
grads = allreduce_grads(grad_list) # #gpu x #param x 2
grads = allreduce_grads(grad_list, average=self._average) # #gpu x #param x 2
else:
agg_grad_and_vars = average_grads(grad_list, colocation=False, devices=['/cpu:0']) # #param x 2
grads = [] # #gpu x #param x 2
......
......@@ -96,13 +96,14 @@ class LeastLoadedDeviceSetter(object):
return "LeastLoadedDeviceSetter-{}".format(self.worker_device)
def allreduce_grads(all_grads):
def allreduce_grads(all_grads, average):
"""
All-reduce the gradients among devices (averaging or summing them). Results are broadcasted to all devices.
Args:
all_grads (K x N x 2): A list of K lists. Each of the list is a list of N (grad, var) tuples.
The variables have to be the same across the K lists.
average (bool): average gradients or not.
Returns:
(K x N x 2): same as input, but each grad is replaced by the all-reduced (averaged or summed) result over the K lists.
......@@ -122,7 +123,8 @@ def allreduce_grads(all_grads):
for (_, v), g in zip(grad_and_vars, summed):
with tf.device(g.device):
# tensorflow/benchmarks didn't average gradients
g = tf.multiply(g, 1.0 / nr_tower)
if average:
g = tf.multiply(g, 1.0 / nr_tower)
grads_for_a_var.append((g, v))
new_all_grads.append(grads_for_a_var)
......
......@@ -138,13 +138,14 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
"""
@map_arg(gpus=_int_to_range)
def __init__(self, gpus):
def __init__(self, gpus, average=True):
"""
Args:
gpus ([int]): list of GPU ids.
gpus (int or [int]): list of GPU ids.
average (bool): whether to average or sum gradients.
"""
self.devices = gpus
self._builder = SyncMultiGPUReplicatedBuilder(gpus)
self._builder = SyncMultiGPUReplicatedBuilder(gpus, average)
super(SyncMultiGPUTrainerReplicated, self).__init__()
def _setup_graph(self, input, get_cost_fn, get_opt_fn):
......@@ -283,18 +284,21 @@ class HorovodTrainer(SingleCostTrainer):
(Add other environment variables you need by -x, e.g. PYTHONPATH, PATH)
Note:
1. Gradients are averaged among all processes.
2. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
1. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
3. Due to the use of MPI, training is less informative (no progress bar).
2. Due to the use of MPI, training is less informative (no progress bar).
4. MPI often fails to kill all processes. Be sure to check it.
3. MPI often fails to kill all processes. Be sure to check it.
"""
def __init__(self):
def __init__(self, average=True):
"""
Args:
average (bool): whether to average or sum the gradients across processes.
"""
hvd.init()
self.is_chief = hvd.rank() == 0
self._local_rank = hvd.local_rank()
self._average = average
logger.info("Horovod local rank={}".format(self._local_rank))
super(HorovodTrainer, self).__init__()
......@@ -306,7 +310,7 @@ class HorovodTrainer(SingleCostTrainer):
with tf.name_scope("HVDAllReduce"):
for grad, var in grads:
if grad is not None:
avg_grad = hvd.allreduce(grad, average=True)
avg_grad = hvd.allreduce(grad, average=self._average)
averaged_gradients.append((avg_grad, var))
else:
averaged_gradients.append((None, var))
......@@ -323,8 +327,7 @@ class HorovodTrainer(SingleCostTrainer):
op = hvd.broadcast_global_variables(0)
cb = RunOp(
op, run_before=True,
run_as_trigger=False, verbose=True)
cb.chief_only = False
run_as_trigger=False, verbose=True).set_chief_only(False)
return [cb]
@HIDE_DOC
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment