Commit 54c5a42d authored by Yuxin Wu

Correct average=False && use_nccl=False

parent f4f41711
@@ -100,6 +100,9 @@ class GPUUtilizationTracker(Callback):
                 if stop_evt.is_set():   # or on exit
                     return
                 evt.clear()
+                # Ignore the last datapoint. Usually is zero, makes us underestimate the util.
+                stats -= data
+                cnt -= 1
                 rst_queue.put(stats / cnt)
                 break
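The worker now subtracts the final reading before computing the mean, because that sample is taken after the epoch has already stopped and is usually zero. A toy sketch of the same arithmetic (hypothetical numbers, not the callback's actual sampling loop):

import numpy as np

# Hypothetical 2-GPU run: per-second utilization readings, ending with the
# near-zero reading taken while the towers are already idle.
samples = [np.array([90., 85.]), np.array([88., 92.]), np.array([0., 0.])]

stats = np.zeros(2)
cnt = 0
for data in samples:
    stats += data
    cnt += 1

# Drop the last datapoint, as the callback now does.
stats -= data
cnt -= 1
print(stats / cnt)   # [89.  88.5] instead of the underestimate [59.33 59.  ]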
@@ -221,7 +221,9 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
         if self._use_nccl:
             self.grads = allreduce_grads(grad_list, average=self._average)  # #gpu x #param x 2
         else:
-            agg_grad_and_vars = average_grads(grad_list, colocation=False, devices=['/cpu:0'])  # #param x 2
+            agg_grad_and_vars = average_grads(
+                grad_list, colocation=False,
+                devices=['/cpu:0'], average=self._average)  # #param x 2
             self.grads = []  # #gpu x #param x 2
             for grad_and_vars in grad_list:  # grad_and_vars: #paramx2
                 # take v from each tower, and g from average.
@@ -133,7 +133,7 @@ def allreduce_grads(all_grads, average):
     return ret


-def average_grads(all_grads, colocation=True, devices=None):
+def average_grads(all_grads, colocation=True, devices=None, average=True):
     """
     Average the gradients.
@@ -143,6 +143,7 @@ def average_grads(all_grads, colocation=True, devices=None):
         colocation (bool): colocate gradient averaging on the device of the variable.
         devices (list[str]): assign the averaging to these device in
             round-robin. Cannot be used together with ``colocation``.
+        average (bool): do average or sum

     Returns:
         (N x 2): A list of N (grad, var) tuples, where grad is averaged over K.
@@ -154,6 +155,13 @@ def average_grads(all_grads, colocation=True, devices=None):
     nr_tower = len(all_grads)
     if nr_tower == 1:
         return all_grads[0]
+
+    def aggregate(grads):
+        if average:
+            return tf.multiply(tf.add_n(grads), 1.0 / nr_tower)
+        else:
+            return tf.add_n(grads)
+
     ret = []
     with tf.name_scope('AvgGrad'):
         for idx, grad_and_vars in enumerate(zip(*all_grads)):
@@ -163,16 +171,13 @@ def average_grads(all_grads, colocation=True, devices=None):
             if colocation:
                 with tf.device(v.device):       # colocate summed grad with var
-                    grad = tf.multiply(
-                        tf.add_n(grads), 1.0 / nr_tower)
+                    grad = aggregate(grads)
             elif devices is None:
-                grad = tf.multiply(
-                    tf.add_n(grads), 1.0 / nr_tower)
+                grad = aggregate(grads)
             else:
                 dev = devices[idx % len(devices)]
                 with tf.device(dev):
-                    grad = tf.multiply(
-                        tf.add_n(grads), 1.0 / nr_tower)
+                    grad = aggregate(grads)
             ret.append((grad, v))
     return ret
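The new `average` flag only changes the aggregation step: with average=True the summed per-tower gradients are divided by the number of towers, with average=False they are returned as a plain sum. A small standalone check (TF 1.x graph mode assumed; the import path is an assumption based on where the function lives in this repo):

import tensorflow as tf
from tensorpack.graph_builder.utils import average_grads  # assumed import path

# Two towers, one parameter each, given as the (grad, var) lists average_grads() expects.
v = tf.Variable(1.0, name='w')
all_grads = [
    [(tf.constant(2.0), v)],   # gradients from tower 0
    [(tf.constant(4.0), v)],   # gradients from tower 1
]

averaged = average_grads(all_grads, colocation=False, average=True)
summed = average_grads(all_grads, colocation=False, average=False)

with tf.Session() as sess:
    print(sess.run(averaged[0][0]))   # 3.0 -> (2 + 4) / 2
    print(sess.run(summed[0][0]))     # 6.0 -> 2 + 4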
...@@ -58,13 +58,7 @@ def get_global_step_var(): ...@@ -58,13 +58,7 @@ def get_global_step_var():
""" """
scope = tf.VariableScope(reuse=False, name='') # the root vs scope = tf.VariableScope(reuse=False, name='') # the root vs
with tf.variable_scope(scope): with tf.variable_scope(scope):
if get_tf_version_number() <= 1.0: var = tf.train.get_or_create_global_step()
var = tf.get_variable('global_step',
initializer=tf.constant(0, dtype=tf.int64),
trainable=False, dtype=tf.int64)
tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, var)
else:
var = tf.train.get_or_create_global_step()
return var return var
......
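On TF versions that ship tf.train.get_or_create_global_step, the helper already performs the bookkeeping the deleted branch did by hand: it creates an int64, non-trainable variable and registers it in the GLOBAL_STEP collection, returning the existing one on later calls. A minimal sketch of that behavior (hypothetical snippet, graph mode assumed):

import tensorflow as tf

with tf.Graph().as_default():
    gs = tf.train.get_or_create_global_step()
    # The variable is registered in the GLOBAL_STEP collection automatically.
    print(tf.get_collection(tf.GraphKeys.GLOBAL_STEP))
    # A second call returns the existing step rather than creating another variable.
    assert tf.train.get_or_create_global_step().name == gs.name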
@@ -143,6 +143,7 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
         Args:
             gpus (int or [int]): list of GPU ids.
             average (bool): whether to average or sum gradients.
+            use_nccl (bool): use NCCL or TensorFlow copy to reduce.
         """
         self.devices = gpus
         self._builder = SyncMultiGPUReplicatedBuilder(gpus, average, use_nccl)
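With this fix, the flag combination in the commit title can be exercised end to end through the trainer. A hedged usage sketch (import paths and the 2-GPU setting are assumptions; only the trainer arguments are what the commit touches):

from tensorpack import launch_train_with_config
from tensorpack.train import SyncMultiGPUTrainerReplicated

# Sum (rather than average) gradients across the towers, and reduce them with a
# plain TensorFlow copy on CPU instead of NCCL -- the code path corrected here.
trainer = SyncMultiGPUTrainerReplicated([0, 1], average=False, use_nccl=False)

# `config` is a hypothetical TrainConfig (model, dataflow, callbacks) defined elsewhere.
# launch_train_with_config(config, trainer)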