Commit 1d99dc4e authored by Yuxin Wu

organize name scopes in trainers

parent 5f750f13
@@ -4,7 +4,7 @@
 import tensorflow as tf
 import re
-from six.moves import zip, range
+from six.moves import range
 from ..utils.argtools import memoized
 from ..tfutils.common import get_op_tensor_name, get_global_step_var
@@ -194,32 +194,6 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
         # Device for queues for managing synchronization between servers
         self.sync_queue_devices = ['/job:ps/task:%s/cpu:0' % i for i in range(self.num_ps)]
 
-    @staticmethod
-    def _average_grads(tower_grads, devices):
-        """
-        Average grads from towers.
-        The device where the average happens is chosen with round-robin.
-
-        Args:
-            tower_grads: Ngpu x Nvar x 2
-
-        Returns:
-            Nvar x 2
-        """
-        nr_device = len(devices)
-        if nr_device == 1:
-            return tower_grads[0]
-        new_tower_grads = []
-        with tf.name_scope('AvgGrad'):
-            for i, grad_and_vars in enumerate(zip(*tower_grads)):
-                v = grad_and_vars[0][1]  # Ngpu * 2
-                all_grads = [g for (g, _) in grad_and_vars]
-                with tf.device(devices[i % nr_device]):
-                    grad = tf.multiply(
-                        tf.add_n(all_grads), 1.0 / nr_device)
-                new_tower_grads.append((grad, v))
-        return new_tower_grads
-
     @staticmethod
     def _apply_shadow_vars(avg_grads):
         """
@@ -298,7 +272,7 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
             use_vs=[True] * len(self.towers))  # open vs at each tower
         DataParallelBuilder._check_grad_list(grad_list)
 
-        avg_grads = DistributedReplicatedBuilder._average_grads(grad_list, self.raw_devices)
+        avg_grads = average_grads(grad_list, devices=self.raw_devices)
         with tf.device(self.param_server_device):
             ps_var_grads = DistributedReplicatedBuilder._apply_shadow_vars(avg_grads)
             var_update_ops = self._apply_gradients_and_copy(
@@ -312,9 +286,11 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
             'post_copy_barrier', [main_fetch])
 
         # initial local_vars syncing
-        initial_sync_op = self._get_initial_sync_op()
+        with tf.name_scope('initial_sync_variables'):
+            initial_sync_op = self._get_initial_sync_op()
         if len(self._shadow_model_vars) and self.is_chief:
-            model_sync_op = self._get_sync_model_vars_op()
+            with tf.name_scope('sync_model_variables'):
+                model_sync_op = self._get_sync_model_vars_op()
         else:
             model_sync_op = None
         return train_op, initial_sync_op, model_sync_op
@@ -332,19 +308,20 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
            list of copy ops
         """
         # TODO do this for variables together?
-        var_update_ops = []
-        for vid, (g, v) in enumerate(ps_var_grads):
-            # TODO do we put momentum variables into local or global?
-            apply_gradient_op = opt.apply_gradients([(g, v)])
-            barrier = self._add_sync_queues_and_barrier(
-                'param_update_barrier_{}'.format(vid), [apply_gradient_op])
-            with tf.control_dependencies([barrier]), \
-                    tf.device(self.cpu_device):
-                updated_value = v.read_value()
-                for towerid in range(self.nr_gpu):
-                    var_update_ops.append(
-                        raw_grad_list[towerid][vid][1].assign(updated_value))
-        return var_update_ops
+        with tf.name_scope('apply_gradients'):
+            var_update_ops = []
+            for vid, (g, v) in enumerate(ps_var_grads):
+                # TODO do we put momentum variables into local or global?
+                apply_gradient_op = opt.apply_gradients([(g, v)])
+                barrier = self._add_sync_queues_and_barrier(
+                    'param_update_barrier_{}'.format(vid), [apply_gradient_op])
+                with tf.control_dependencies([barrier]), \
+                        tf.device(self.cpu_device):
+                    updated_value = v.read_value()
+                    for towerid in range(self.nr_gpu):
+                        var_update_ops.append(
+                            raw_grad_list[towerid][vid][1].assign(updated_value))
+            return var_update_ops
 
     def _get_initial_sync_op(self):
         """
...
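
For reference, the logic deleted above is not gone: its round-robin averaging now lives in a shared utility, which the builder calls as `average_grads(grad_list, devices=self.raw_devices)`. Below is a minimal standalone sketch of that logic, assuming TF1 graph mode; the function name `average_grads_roundrobin` is illustrative and not the actual utility's signature.

# Illustrative sketch of the round-robin gradient averaging shown in the
# removed `_average_grads`; not the library's actual `average_grads` API.
import tensorflow as tf

def average_grads_roundrobin(tower_grads, devices):
    """tower_grads: one [(grad, var), ...] list per GPU.
    Returns a single [(avg_grad, var), ...] list; each variable's average is
    computed on a device chosen round-robin from `devices`."""
    nr_device = len(devices)
    if nr_device == 1:
        return tower_grads[0]
    new_tower_grads = []
    with tf.name_scope('AvgGrad'):
        # zip(*tower_grads) regroups the k-th (grad, var) pair of every tower
        for i, grad_and_vars in enumerate(zip(*tower_grads)):
            v = grad_and_vars[0][1]            # the variable is shared across towers
            all_grads = [g for g, _ in grad_and_vars]
            with tf.device(devices[i % nr_device]):
                avg = tf.multiply(tf.add_n(all_grads), 1.0 / nr_device)
            new_tower_grads.append((avg, v))
    return new_tower_grads

Picking the averaging device round-robin spreads the `add_n`/`multiply` reductions over the available devices instead of serializing them all on one device, which is the point made by the removed docstring.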
@@ -218,15 +218,17 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
         train_ops = []
         opt = get_opt_fn()
-        for idx, grad_and_vars in enumerate(grads):
-            with tf.device(raw_devices[idx]):
-                # apply_gradients may create variables. Make them LOCAL_VARIABLES
-                with override_to_local_variable(enable=idx > 0):
-                    train_ops.append(opt.apply_gradients(
-                        grad_and_vars, name='apply_grad_{}'.format(idx)))
+        with tf.name_scope('apply_gradients'):
+            for idx, grad_and_vars in enumerate(grads):
+                with tf.device(raw_devices[idx]):
+                    # apply_gradients may create variables. Make them LOCAL_VARIABLES
+                    with override_to_local_variable(enable=idx > 0):
+                        train_ops.append(opt.apply_gradients(
+                            grad_and_vars, name='apply_grad_{}'.format(idx)))
         train_op = tf.group(*train_ops, name='train_op')
 
-        post_init_op = SyncMultiGPUReplicatedBuilder.get_post_init_ops()
+        with tf.name_scope('sync_variables'):
+            post_init_op = SyncMultiGPUReplicatedBuilder.get_post_init_ops()
         return train_op, post_init_op
 
     # Adopt from https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/variable_mgr.py
...
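
The rest of the commit only wraps existing op construction in `tf.name_scope(...)`. A name scope changes nothing about execution: it prefixes the names of ops created inside it, so TensorBoard's graph view groups them under one collapsible node, while variables created through `tf.get_variable` keep their names. A minimal sketch, with illustrative names and TF1 graph mode assumed:

# Illustrative example of what the new name_scope wrappers do; the graph,
# placeholder, and variable names here are made up for demonstration.
import tensorflow as tf

g = tf.Graph()
with g.as_default():
    x = tf.placeholder(tf.float32, [None, 4], name='x')
    w = tf.get_variable('w', shape=[4, 1])
    with tf.name_scope('apply_gradients'):
        y = tf.matmul(x, w)        # op is created inside the scope

    print(y.op.name)   # 'apply_gradients/MatMul' -- grouped under the scope node
    print(w.op.name)   # 'w' -- tf.get_variable ignores name_scope, so variable
                       #        names (and checkpoints) are unaffected

Because only op display names change, these wrappers can be added to the trainers without affecting checkpoints or training behavior; they just organize the graph visualization.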