Commit e1a879d7 authored by Yuxin Wu

Make "builder.grads" visible to trainers.

parent 9bbdf94d
@@ -8,10 +8,6 @@ cache:
   directories:
   - $HOME/tensorpack_data
-env:
-  global:
-    - TF_VERSION=1.5.0
 addons:
   apt:
     packages:
@@ -24,10 +20,16 @@ matrix:
   include:
     - os: linux
       python: 2.7
-      env: TF_TYPE=release
+      env: TF_VERSION=1.3.0 TF_TYPE=release
+    - os: linux
+      python: 3.5
+      env: TF_VERSION=1.3.0 TF_TYPE=release
+    - os: linux
+      python: 2.7
+      env: TF_VERSION=1.5.0 TF_TYPE=release
     - os: linux
       python: 3.5
-      env: TF_TYPE=release
+      env: TF_VERSION=1.5.0 TF_TYPE=release
     - os: linux
       python: 2.7
       env: TF_VERSION=1.head TF_TYPE=nightly
@@ -123,6 +123,9 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
     It is an equivalent of ``--variable_update=parameter_server`` in
     `tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
+
+    Attribute:
+        grads: list of (g, v). Averaged gradients, available after build().
     """

     def __init__(self, towers, ps_device):
         """
@@ -158,15 +161,15 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
         # self.train_op = tf.group(*ops)
         # return

-        grads = average_grads(grad_list, colocation=True)
+        self.grads = average_grads(grad_list, colocation=True)
         # grads = grad_list[0]

         opt = get_opt_fn()
         if self.ps_device == 'cpu':
             with tf.device('/cpu:0'):
-                train_op = opt.apply_gradients(grads, name='train_op')
+                train_op = opt.apply_gradients(self.grads, name='train_op')
         else:
-            train_op = opt.apply_gradients(grads, name='train_op')
+            train_op = opt.apply_gradients(self.grads, name='train_op')
         return train_op
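The point of the change above is that the averaged gradients are now retained on the builder, so trainers (or user code) can inspect them after the graph is built. A minimal usage sketch, assuming a builder instance and the usual get_grad_fn/get_opt_fn callables (illustrative, not part of this commit):

# Hypothetical usage: get_grad_fn and get_opt_fn are placeholders.
builder = SyncMultiGPUParameterServerBuilder(towers=[0, 1], ps_device='cpu')
train_op = builder.build(get_grad_fn, get_opt_fn)

# After build(), the averaged (gradient, variable) pairs are exposed:
for g, v in builder.grads:
    print(v.name)  # e.g. a trainer could attach gradient summaries here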
@@ -179,11 +182,16 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
     It is an equivalent of ``--variable_update=replicated`` in
     `tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
+
+    Attribute:
+        grads: a list of #GPU lists of (g, v). Synchronized gradients on each device,
+            available after build(). Though on different devices, they should contain the same values.
     """

-    def __init__(self, towers, average):
+    def __init__(self, towers, average, use_nccl):
         super(SyncMultiGPUReplicatedBuilder, self).__init__(towers)
         self._average = average
+        self._use_nccl = use_nccl

     def build(self, get_grad_fn, get_opt_fn):
         """
@@ -210,20 +218,20 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
         DataParallelBuilder._check_grad_list(grad_list)

-        if True:
-            grads = allreduce_grads(grad_list, average=self._average)  # #gpu x #param x 2
+        if self._use_nccl:
+            self.grads = allreduce_grads(grad_list, average=self._average)  # #gpu x #param x 2
         else:
             agg_grad_and_vars = average_grads(grad_list, colocation=False, devices=['/cpu:0'])  # #param x 2
-            grads = []  # #gpu x #param x 2
+            self.grads = []  # #gpu x #param x 2
             for grad_and_vars in grad_list:  # grad_and_vars: #param x 2
                 # take v from each tower, and g from average.
-                grads.append(
+                self.grads.append(
                     [(g, v) for (_, v), (g, _) in zip(grad_and_vars, agg_grad_and_vars)])

         train_ops = []
         opt = get_opt_fn()
         with tf.name_scope('apply_gradients'):
-            for idx, grad_and_vars in enumerate(grads):
+            for idx, grad_and_vars in enumerate(self.grads):
                 with tf.device(raw_devices[idx]):
                     # apply_gradients may create variables. Make them LOCAL_VARIABLES
                     with override_to_local_variable(enable=idx > 0):
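In the non-NCCL branch above, gradients are averaged once on the CPU and then re-paired with each tower's own copy of the variables. A standalone sketch of that zip/re-pairing step, with strings standing in for tensors and variables (illustrative only):

# Each entry is a (gradient, variable) pair; one list per tower.
tower0 = [('g0_a', 'var_a/tower0'), ('g0_b', 'var_b/tower0')]
averaged = [('g_avg_a', 'var_a'), ('g_avg_b', 'var_b')]

# Take v from the tower and g from the average, exactly as in the builder:
repaired = [(g, v) for (_, v), (g, _) in zip(tower0, averaged)]
assert repaired == [('g_avg_a', 'var_a/tower0'), ('g_avg_b', 'var_b/tower0')]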
@@ -138,14 +138,15 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
     """
     @map_arg(gpus=_int_to_range)
-    def __init__(self, gpus, average=True):
+    def __init__(self, gpus, average=True, use_nccl=True):
         """
         Args:
             gpus (int or [int]): list of GPU ids.
             average (bool): whether to average or sum gradients.
+            use_nccl (bool): whether to synchronize gradients with NCCL all-reduce or by averaging on CPU.
         """
         self.devices = gpus
-        self._builder = SyncMultiGPUReplicatedBuilder(gpus, average)
+        self._builder = SyncMultiGPUReplicatedBuilder(gpus, average, use_nccl)
         super(SyncMultiGPUTrainerReplicated, self).__init__()

     def _setup_graph(self, input, get_cost_fn, get_opt_fn):
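Putting the pieces together, a hedged sketch of how a user script might opt out of NCCL with the new flag; the trainer and builder names come from this commit, while the surrounding training setup is elided:

# Illustrative: build a replicated trainer that averages gradients on CPU
# (use_nccl=False) instead of using NCCL all-reduce.
trainer = SyncMultiGPUTrainerReplicated(gpus=[0, 1, 2, 3], average=True, use_nccl=False)

# Once the trainer has set up the graph, the synchronized per-GPU
# (gradient, variable) lists are visible through its builder:
# trainer._builder.grads  ->  one list of (g, v) pairs per GPU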