Commit 54c5a42d authored by Yuxin Wu

Correct average=False && use_nccl=False

parent f4f41711
@@ -100,6 +100,9 @@ class GPUUtilizationTracker(Callback):
                 if stop_evt.is_set():   # or on exit
                     return
                 evt.clear()
+                # Ignore the last datapoint. Usually is zero, makes us underestimate the util.
+                stats -= data
+                cnt -= 1
                 rst_queue.put(stats / cnt)
                 break
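The worker now subtracts the final reading before computing the mean, because that sample is taken after the epoch has already stopped and is usually zero. A toy sketch of the same arithmetic (hypothetical numbers, not the callback's actual sampling loop):

import numpy as np

# Hypothetical 2-GPU run: per-second utilization readings, ending with the
# near-zero reading taken while the towers are already idle.
samples = [np.array([90., 85.]), np.array([88., 92.]), np.array([0., 0.])]

stats = np.zeros(2)
cnt = 0
for data in samples:
    stats += data
    cnt += 1

# Drop the last datapoint, as the callback now does.
stats -= data
cnt -= 1
print(stats / cnt)   # [89.  88.5] instead of the underestimate [59.33 59.  ]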
@@ -221,7 +221,9 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
         if self._use_nccl:
             self.grads = allreduce_grads(grad_list, average=self._average)  # #gpu x #param x 2
         else:
-            agg_grad_and_vars = average_grads(grad_list, colocation=False, devices=['/cpu:0'])  # #param x 2
+            agg_grad_and_vars = average_grads(
+                grad_list, colocation=False,
+                devices=['/cpu:0'], average=self._average)  # #param x 2
             self.grads = []  # #gpu x #param x 2
             for grad_and_vars in grad_list:  # grad_and_vars: #paramx2
                 # take v from each tower, and g from average.
@@ -133,7 +133,7 @@ def allreduce_grads(all_grads, average):
     return ret


-def average_grads(all_grads, colocation=True, devices=None):
+def average_grads(all_grads, colocation=True, devices=None, average=True):
     """
     Average the gradients.
@@ -143,6 +143,7 @@ def average_grads(all_grads, colocation=True, devices=None):
         colocation (bool): colocate gradient averaging on the device of the variable.
         devices (list[str]): assign the averaging to these device in
             round-robin. Cannot be used together with ``colocation``.
+        average (bool): do average or sum

     Returns:
         (N x 2): A list of N (grad, var) tuples, where grad is averaged over K.
@@ -154,6 +155,13 @@ def average_grads(all_grads, colocation=True, devices=None):
     nr_tower = len(all_grads)
     if nr_tower == 1:
         return all_grads[0]
+
+    def aggregate(grads):
+        if average:
+            return tf.multiply(tf.add_n(grads), 1.0 / nr_tower)
+        else:
+            return tf.add_n(grads)
+
     ret = []
     with tf.name_scope('AvgGrad'):
         for idx, grad_and_vars in enumerate(zip(*all_grads)):
@@ -163,16 +171,13 @@ def average_grads(all_grads, colocation=True, devices=None):
             if colocation:
                 with tf.device(v.device):       # colocate summed grad with var
-                    grad = tf.multiply(
-                        tf.add_n(grads), 1.0 / nr_tower)
+                    grad = aggregate(grads)
             elif devices is None:
-                grad = tf.multiply(
-                    tf.add_n(grads), 1.0 / nr_tower)
+                grad = aggregate(grads)
             else:
                 dev = devices[idx % len(devices)]
                 with tf.device(dev):
-                    grad = tf.multiply(
-                        tf.add_n(grads), 1.0 / nr_tower)
+                    grad = aggregate(grads)
             ret.append((grad, v))
     return ret
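The new `average` flag only changes the aggregation step: with average=True the summed per-tower gradients are divided by the number of towers, with average=False they are returned as a plain sum. A small standalone check (TF 1.x graph mode assumed; the import path is an assumption based on where the function lives in this repo):

import tensorflow as tf
from tensorpack.graph_builder.utils import average_grads  # assumed import path

# Two towers, one parameter each, given as the (grad, var) lists average_grads() expects.
v = tf.Variable(1.0, name='w')
all_grads = [
    [(tf.constant(2.0), v)],   # gradients from tower 0
    [(tf.constant(4.0), v)],   # gradients from tower 1
]

averaged = average_grads(all_grads, colocation=False, average=True)
summed = average_grads(all_grads, colocation=False, average=False)

with tf.Session() as sess:
    print(sess.run(averaged[0][0]))   # 3.0 -> (2 + 4) / 2
    print(sess.run(summed[0][0]))     # 6.0 -> 2 + 4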
...@@ -58,13 +58,7 @@ def get_global_step_var(): ...@@ -58,13 +58,7 @@ def get_global_step_var():
""" """
scope = tf.VariableScope(reuse=False, name='') # the root vs scope = tf.VariableScope(reuse=False, name='') # the root vs
with tf.variable_scope(scope): with tf.variable_scope(scope):
if get_tf_version_number() <= 1.0: var = tf.train.get_or_create_global_step()
var = tf.get_variable('global_step',
initializer=tf.constant(0, dtype=tf.int64),
trainable=False, dtype=tf.int64)
tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, var)
else:
var = tf.train.get_or_create_global_step()
return var return var
......
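On TF versions that ship tf.train.get_or_create_global_step, the helper already performs the bookkeeping the deleted branch did by hand: it creates an int64, non-trainable variable and registers it in the GLOBAL_STEP collection, returning the existing one on later calls. A minimal sketch of that behavior (hypothetical snippet, graph mode assumed):

import tensorflow as tf

with tf.Graph().as_default():
    gs = tf.train.get_or_create_global_step()
    # The variable is registered in the GLOBAL_STEP collection automatically.
    print(tf.get_collection(tf.GraphKeys.GLOBAL_STEP))
    # A second call returns the existing step rather than creating another variable.
    assert tf.train.get_or_create_global_step().name == gs.name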
@@ -143,6 +143,7 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
         Args:
             gpus (int or [int]): list of GPU ids.
             average (bool): whether to average or sum gradients.
+            use_nccl (bool): use NCCL or TensorFlow copy to reduce.
         """
         self.devices = gpus
         self._builder = SyncMultiGPUReplicatedBuilder(gpus, average, use_nccl)
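With this fix, the flag combination in the commit title can be exercised end to end through the trainer. A hedged usage sketch (import paths and the 2-GPU setting are assumptions; only the trainer arguments are what the commit touches):

from tensorpack import launch_train_with_config
from tensorpack.train import SyncMultiGPUTrainerReplicated

# Sum (rather than average) gradients across the towers, and reduce them with a
# plain TensorFlow copy on CPU instead of NCCL -- the code path corrected here.
trainer = SyncMultiGPUTrainerReplicated([0, 1], average=False, use_nccl=False)

# `config` is a hypothetical TrainConfig (model, dataflow, callbacks) defined elsewhere.
# launch_train_with_config(config, trainer)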