Commit ca16fb7e authored by Yuxin Wu

changes in tower to allow replicated training

parent 118c2a26
@@ -20,7 +20,7 @@ Alternative link to this page: [http://dorefa.net](http://dorefa.net)
 To use the script. You'll need:
-+ TensorFlow >= 1.0.0rc0
++ TensorFlow >= 1.0.0 (>=1.1 for MultiGPU)
 + OpenCV bindings for Python
...
@@ -38,12 +38,16 @@ def regularize_cost(regex, func, name='regularize_cost'):
         cost = cost + regularize_cost("fc.*/W", l2_regularizer(1e-5))
     """
+    ctx = get_current_tower_context()
     G = tf.get_default_graph()
     params = G.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

     costs = []
     for p in params:
         para_name = p.name
+        # in replicated mode, only regularize variables inside this tower
+        if ctx.has_own_variables and (not para_name.startswith(ctx.name)):
+            continue
         if re.search(regex, para_name):
             costs.append(func(p))
             _log_regularizer(para_name)
...
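For context, a minimal sketch (not part of the diff; the variable names are hypothetical) of what the new `has_own_variables` check guards against: in replicated mode every tower owns its own copy of each variable, prefixed with the tower name, so `regularize_cost` should only pick up the current tower's copies.

```python
import re

# Hypothetical trainable-variable names with two replicated towers:
# each tower holds its own copy of every weight.
trainable = ['tower0/conv0/W:0', 'tower0/fc0/W:0',
             'tower1/conv0/W:0', 'tower1/fc0/W:0']
tower_name = 'tower1'   # what ctx.name would be inside tower 1

# Mirrors the new filter in regularize_cost(): skip other towers'
# variables, then apply the user's regex as before.
selected = [v for v in trainable
            if v.startswith(tower_name) and re.search('fc.*/W', v)]
print(selected)   # ['tower1/fc0/W:0']
```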
@@ -6,6 +6,7 @@ import tensorflow as tf
 from termcolor import colored
 from tabulate import tabulate

+from ..tfutils.tower import get_current_tower_context
 from ..utils import logger
 from .summary import add_moving_summary
@@ -62,7 +63,9 @@ def apply_slim_collections(cost):
         a scalar tensor, the cost after applying the collections.
     """
     regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+    ctx = get_current_tower_context()
     if len(regulization_losses) > 0:
+        assert not ctx.has_own_variables, "REGULARIZATION_LOSSES collection doesn't work in replicated mode!"
         logger.info("Applying REGULARIZATION_LOSSES on cost.")
         reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
         cost = tf.add(reg_loss, cost, name='total_cost')
...
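A short illustration (not from the commit) of why the assert above is needed: `tf.GraphKeys.REGULARIZATION_LOSSES` is a graph-global collection, so with per-tower variable copies every tower would sum up the regularization losses created by all towers, not just its own.

```python
import tensorflow as tf

reg = tf.contrib.layers.l2_regularizer(1e-4)
with tf.variable_scope('tower0'):
    w0 = tf.get_variable('W', shape=[2], regularizer=reg)
with tf.variable_scope('tower1'):
    w1 = tf.get_variable('W', shape=[2], regularizer=reg)

# Both towers' losses end up in the same global collection.
losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
print([l.name for l in losses])   # one loss per tower, regardless of the current tower
```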
@@ -15,12 +15,15 @@ _CurrentTowerContext = None
 class TowerContext(object):
     """ A context where the current model is being built in. """

-    def __init__(self, tower_name, device=None, is_training=None):
+    def __init__(self, tower_name,
+                 device=None, is_training=None,
+                 var_strategy='shared'):
         """
         Args:
             tower_name (str): 'tower0', 'towerp0', or ''
             device (str or device function): the device to use. Defaults to either cpu0 or gpu0.
             is_training (bool): if None, automatically determine from tower_name.
+            var_strategy (str): either 'shared' or 'replicated'.
         """
         self._name = tower_name
         if device is None:
@@ -31,6 +34,11 @@ class TowerContext(object):
             is_training = not self._name.startswith(PREDICT_TOWER)
         self._is_training = is_training

+        assert var_strategy in ['replicated', 'shared'], var_strategy
+        self._var_strategy = var_strategy
+        if self._var_strategy == 'replicated':
+            assert self._name
+
     @property
     def is_main_training_tower(self):
         return self.is_training and (self._name == '' or self._name == 'tower0')
@@ -43,6 +51,10 @@ class TowerContext(object):
     def is_training(self):
         return self._is_training

+    @property
+    def has_own_variables(self):
+        return self._var_strategy == 'replicated'
+
     @property
     def name(self):
         return self._name
@@ -88,18 +100,25 @@ class TowerContext(object):
         assert _CurrentTowerContext is None, \
             "Nesting TowerContext!"
         _CurrentTowerContext = self
+        self._ctxs = []
         if len(self._name):
-            self._scope_ctx = tf.name_scope(self._name)
-            self._scope_ctx.__enter__()
-        self._device_ctx = tf.device(self._device)
-        self._device_ctx.__enter__()
+            if self.has_own_variables:
+                # open new variable scopes
+                self._ctxs.append(tf.variable_scope(self._name))
+            else:
+                # use existing variable scope
+                self._ctxs.append(tf.variable_scope(
+                    tf.get_variable_scope(), reuse=self.index > 0))
+            self._ctxs.append(tf.name_scope(self._name))
+        self._ctxs.append(tf.device(self._device))
+        for c in self._ctxs:
+            c.__enter__()

     def __exit__(self, exc_type, exc_val, exc_tb):
         global _CurrentTowerContext
         _CurrentTowerContext = None
-        if len(self._name):
-            self._scope_ctx.__exit__(exc_type, exc_val, exc_tb)
-        self._device_ctx.__exit__(exc_type, exc_val, exc_tb)
+        for c in self._ctxs[::-1]:
+            c.__exit__(exc_type, exc_val, exc_tb)
         return False

     def __str__(self):
...
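A rough usage sketch of the reworked context manager, assuming this commit's `TowerContext` (variable names are only illustrative): with the default `'shared'` strategy the tower reuses the enclosing variable scope, while `'replicated'` opens a per-tower variable scope, so each tower keeps its own variable copies.

```python
import tensorflow as tf
from tensorpack.tfutils.tower import TowerContext

# 'shared' (default): variables are created in the enclosing (root) scope
# and reused by later towers.
with TowerContext('tower0', is_training=True):
    w_shared = tf.get_variable('W_shared', shape=[3, 3])

# 'replicated': the context opens variable scope 'tower1', so this tower
# owns its own copy of the variable.
with TowerContext('tower1', is_training=True, var_strategy='replicated'):
    w_own = tf.get_variable('W', shape=[3, 3])

print(w_shared.name)   # W_shared:0
print(w_own.name)      # tower1/W:0
```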
@@ -27,12 +27,13 @@ __all__ = ['MultiGPUTrainerBase', 'SyncMultiGPUTrainer',
 class MultiGPUTrainerBase(Trainer):
     """ Base class for multi-gpu training"""
     @staticmethod
-    def build_on_multi_tower(towers, func, devices=None):
+    def build_on_multi_tower(towers, func, devices=None, var_strategy='shared'):
         """
         Args:
             towers: list of gpu relative ids
             func: a lambda to be called inside each tower
             devices: a list of devices to be used. By default will use GPUs in towers.
+            var_strategy (str):

         Returns:
             List of outputs of ``func``, evaluated on each tower.
@@ -40,17 +41,19 @@ class MultiGPUTrainerBase(Trainer):
         logger.info("Training a model of {} tower".format(len(towers)))

         ret = []
-        global_scope = tf.get_variable_scope()
         if devices is not None:
             assert len(devices) == len(towers)
         for idx, t in enumerate(towers):
             device = devices[idx] if devices is not None else '/gpu:{}'.format(t)
-            with tf.variable_scope(global_scope, reuse=idx > 0), \
-                    TowerContext(
+            with TowerContext(
                     'tower{}'.format(idx),
-                    device=device,
-                    is_training=True):
-                logger.info("Building graph for training tower {}...".format(idx))
+                    device=device, is_training=True,
+                    var_strategy=var_strategy):
+                if idx == t:
+                    logger.info("Building graph for training tower {}...".format(idx))
+                else:
+                    logger.info("Building graph for training tower {} on device {}...".format(idx, t))
                 ret.append(func())
@@ -92,14 +95,15 @@ class LeastLoadedDeviceSetter(object):
 class SyncMultiGPUTrainerParameterServer(MultiGPUTrainerBase, SingleCostFeedfreeTrainer):
     """
     A multi-tower multi-GPU trainer which synchronoizes the gradients computed
-    from each tower, averages them and update to variables stored on PS.
+    from each tower, averages them and update to variables stored across all
+    GPUs or on CPU.
     """

     def __init__(self, config, ps_device='gpu'):
         """
         Args:
             config: same as in :class:`QueueInputTrainer`.
-            ps_device: either 'gpu' or 'cpu'
+            ps_device: either 'gpu' or 'cpu', where variables are stored.
         """
         if config.dataflow is not None:
             # use queueinput by default. May need to avoid this in the future (when more input type is available)
...
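A hedged sketch (not part of this commit) of how a caller might use the new `var_strategy` argument; `get_cost` is a hypothetical stand-in for the per-tower graph-building closure that a real trainer would pass in, and the import path is assumed from the repository layout.

```python
import tensorflow as tf
from tensorpack.train.multigpu import MultiGPUTrainerBase

def get_cost():
    # hypothetical per-tower closure; a real trainer builds the model here
    return tf.constant(0., name='dummy_cost')

# Default behaviour: towers share one set of variables (parameter-server style).
shared = MultiGPUTrainerBase.build_on_multi_tower([0, 1], get_cost)

# With this commit: each tower builds under its own variable scope
# ('tower0/...', 'tower1/...'), which is what replicated training needs.
replicated = MultiGPUTrainerBase.build_on_multi_tower(
    [0, 1], get_cost, var_strategy='replicated')
```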
@@ -123,9 +123,11 @@ def get_caffe_pb():
     if not os.path.isfile(caffe_pb_file):
         download(CAFFE_PROTO_URL, dir)
         assert os.path.isfile(os.path.join(dir, 'caffe.proto'))
-        ret = os.system('cd {} && protoc caffe.proto --python_out .'.format(dir))
+        cmd = 'cd {} && protoc caffe.proto --python_out .'.format(dir)
+        ret = os.system(cmd)
         assert ret == 0, \
-            "Command `protoc caffe.proto --python_out .` failed!"
+            "Command `{}` failed!".format(cmd)
+        assert os.path.isfile(caffe_pb_file), caffe_pb_file
     import imp
     return imp.load_source('caffepb', caffe_pb_file)
...
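As an aside (not in the commit), a similar "report the exact failing command" effect can be had with `subprocess`, which raises a `CalledProcessError` carrying the command; a small sketch with a hypothetical helper:

```python
import subprocess

def compile_caffe_proto(proto_dir):
    # Equivalent to `cd proto_dir && protoc caffe.proto --python_out .`;
    # on failure, CalledProcessError already contains the full command.
    subprocess.check_call(
        ['protoc', 'caffe.proto', '--python_out', '.'], cwd=proto_dir)
```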