Docs about graph builder

490142d7 · Yuxin Wu · a2f60395 · 490142d7 · 490142d7 · 490142d7
Commit 490142d7 authored Oct 17, 2017 by Yuxin Wu
6 changed files
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -361,7 +361,6 @@ def autodoc_skip_member(app, what, name, obj, skip, options):
        'Triggerable',
        'predictor_factory',
        'get_predictors',
-        'vs_name_for_predictor',
        'RandomCropAroundBox',
        'GaussianDeform',
        'dump_chkpt_vars',

--- a/tensorpack/graph_builder/distributed.py
+++ b/tensorpack/graph_builder/distributed.py
@@ -7,9 +7,7 @@ import re
 from six.moves import zip, range

 from ..utils.argtools import memoized
-from ..tfutils.gradproc import FilterNoneGrad
 from ..tfutils.common import get_global_step_var, get_op_tensor_name
-from ..tfutils.tower import get_current_tower_context

 from .training import DataParallelBuilder

@@ -17,8 +15,28 @@ __all__ = ['DistributedReplicatedBuilder']


 class DistributedReplicatedBuilder(DataParallelBuilder):
+    """
+    Graph builder for distributed replicated training.
+    Each worker process builds the same model on one or more GPUs.
+    Gradients across GPUs are averaged within the worker,
+    and get synchronously applied to the global copy of variables located on PS.
+    Then each worker copies the latest variables from PS back to local.
+
+    See https://www.tensorflow.org/performance/benchmarks for details.
+
+    Note:
+        Gradients are not averaged across workers, but applied to PS variables
+        directly (either with or without locking depending on the optimizer).
+    """

    def __init__(self, towers, server):
+        """
+        Args:
+            towers (list[int]): list of GPU ids.
+            server (tf.train.Server): the server with ps and workers.
+                The job_name must be 'worker' because 'ps' job doesn't need to
+                build any graph.
+        """
        super(DistributedReplicatedBuilder, self).__init__(towers)
        self.server = server
        server_def = server.server_def
@@ -146,6 +164,20 @@ class DistributedReplicatedBuilder(DataParallelBuilder):
            return tf.group(*queue_ops, name=name)

    def build(self, input, get_cost_fn, get_opt_fn):
+        """
+        Args:
+            input (InputSource): the input. Should have been set up.
+            get_cost_fn ([tf.Tensor] -> tf.Tensor): callable which takes a list of input tensors
+                and returns a cost tensor
+            get_opt_fn (-> tf.train.Optimizer): callable which returns an optimizer
+
+        Returns:
+            tf.Operation: the training op
+            tf.Operation: the op which syncs all the local variables from PS.
+                This op should be run before training.
+            tf.Operation: the op which syncs all the local `MODEL_VARIABLES` from PS.
+                You can choose how often to run it by yourself.
+        """
        # do this before everything, because they my need global step
        with tf.device(self.param_server_device):
            gs = get_global_step_var()
@@ -156,21 +188,11 @@ class DistributedReplicatedBuilder(DataParallelBuilder):
        # This makes sure that learning_rate is a global variable (what we expect)
        get_opt_fn()

-        def get_grads():
-            ctx = get_current_tower_context()
-            cost = get_cost_fn(*input.get_input_tensors())
-
-            varlist = ctx.filter_vars_by_vs_name(tf.trainable_variables())
-            opt = get_opt_fn()
-            grads = opt.compute_gradients(
-                cost, var_list=varlist,
-                gate_gradients=False, colocate_gradients_with_ops=True)
-            grads = FilterNoneGrad().process(grads)
-            return grads
+        get_grad_fn, _ = DataParallelBuilder._make_fn(input, get_cost_fn, get_opt_fn)

        # Ngpu * Nvar * 2
        grad_list = DataParallelBuilder.build_on_towers(
-            self.towers, get_grads,
+            self.towers, get_grad_fn,
            devices=self.raw_devices,
            use_vs=[True] * len(self.towers))  # open vs at each tower
        DataParallelBuilder._check_grad_list(grad_list)
@@ -180,7 +202,7 @@ class DistributedReplicatedBuilder(DataParallelBuilder):
            ps_var_grads = DistributedReplicatedBuilder._apply_shadow_vars(avg_grads)
            var_update_ops = self._apply_gradients_and_copy(
                get_opt_fn(), grad_list, ps_var_grads)
-            self._shadow_vars = [v for (_, v) in ps_var_grads]
+            self._shadow_vars = [v for (__, v) in ps_var_grads]
            self._shadow_model_vars = DistributedReplicatedBuilder._shadow_model_variables(self._shadow_vars)

        # TODO add options to synchronize less

--- a/tensorpack/graph_builder/training.py
+++ b/tensorpack/graph_builder/training.py
@@ -37,10 +37,10 @@ class SimpleBuilder(GraphBuilder):
    def build(self, input, get_cost_fn, get_opt_fn):
        """
        Args:
-            input (InputSource): should have been setup already
-            get_cost_fn ([tf.Tensor] -> tf.Tensor): a callable,
-                taking several tensors as input and returns a cost tensor.
-            get_opt_fn (None -> tf.train.Optimizer): a callable that returns an optimizer
+            input (InputSource): the input. Should have been set up.
+            get_cost_fn ([tf.Tensor] -> tf.Tensor): callable which takes a list of input tensors
+                and returns a cost tensor
+            get_opt_fn (-> tf.train.Optimizer): callable which returns an optimizer

        Returns:
            tf.Operation: the training op
@@ -62,7 +62,7 @@ class DataParallelBuilder(GraphBuilder):
    def __init__(self, towers):
        """
        Args:
-            towers(list[int]): list of GPU relative ids.
+            towers(list[int]): list of GPU ids.
        """
        if len(towers) > 1:
            logger.info("Training a model of {} towers".format(len(towers)))
@@ -88,11 +88,12 @@ class DataParallelBuilder(GraphBuilder):
    def build_on_towers(
            towers, func, devices=None, use_vs=None):
        """
-        Run `func` on all towers.
+        Run `func` on all GPUs (towers) and return the results.

        Args:
+            towers (list[int]): a list of GPU ids.
            func: a lambda to be called inside each tower
-            devices: a list of devices to be used. By default will use GPUs in ``towers``.
+            devices: a list of devices to be used. By default will use '/gpu:{tower}'
            use_vs (list[bool]): list of use_vs to passed to TowerContext

        Returns:
@@ -115,10 +116,7 @@ class DataParallelBuilder(GraphBuilder):
                    is_training=True,
                    index=idx,
                    use_vs=usevs):
-                if idx == t:
-                    logger.info("Building graph for training tower {}...".format(idx))
-                else:
-                    logger.info("Building graph for training tower {} on device {}...".format(idx, device))
+                logger.info("Building graph for training tower {} on device {}...".format(idx, device))

                # When use_vs is True, use LOCAL_VARIABLES,
                # so these duplicated variables won't be saved by default.
@@ -131,11 +129,46 @@ class DataParallelBuilder(GraphBuilder):
        restore_collection(backup)
        return ret

+    @staticmethod
+    def _make_fn(input, get_cost_fn, get_opt_fn):
+        # internal use only
+        get_opt_fn = memoized(get_opt_fn)
+
+        def get_grad_fn():
+            ctx = get_current_tower_context()
+            cost = get_cost_fn(*input.get_input_tensors())
+
+            varlist = ctx.filter_vars_by_vs_name(tf.trainable_variables())
+            opt = get_opt_fn()
+            grads = opt.compute_gradients(
+                cost, var_list=varlist,
+                gate_gradients=False, colocate_gradients_with_ops=True)
+            grads = FilterNoneGrad().process(grads)
+            return grads
+
+        return get_grad_fn, get_opt_fn
+

 class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
-    def __init__(self, towers, ps_device):
+    """
+    Graph builder for data-parallel training in 'ParameterServer' mode.
+    It builds one tower on each GPU with
+    shared variable scope. It synchronizes the gradients computed
+    from each tower, averages them and applies to the shared variables.
+
+    See https://www.tensorflow.org/performance/benchmarks for details.
+    """
+    def __init__(self, towers, ps_device=None):
+        """
+        Args:
+            towers(list[int]): list of GPU ids.
+            ps_device (str): either 'gpu' or 'cpu', where variables are stored.
+                Setting to 'cpu' might help when #gpu>=4
+        """
        super(SyncMultiGPUParameterServerBuilder, self).__init__(towers)
-        # TODO auto choose ps_device
+        if ps_device is None:
+            ps_device = 'cpu' if len(towers) >= 4 else 'gpu'
+        assert ps_device in ['cpu', 'gpu']
        self.ps_device = ps_device

    @staticmethod
@@ -158,6 +191,16 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
        return new_tower_grads

    def build(self, input, get_cost_fn, get_opt_fn):
+        """
+        Args:
+            input (InputSource): the input. Should have been set up.
+            get_cost_fn ([tf.Tensor] -> tf.Tensor): callable which takes a list of input tensors
+                and returns a cost tensor
+            get_opt_fn (-> tf.train.Optimizer): callable which returns an optimizer
+
+        Returns:
+            tf.Operation: the training op
+        """
        raw_devices = ['/gpu:{}'.format(k) for k in self.towers]
        if self.ps_device == 'gpu':
            devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
@@ -165,22 +208,9 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
            devices = [tf.train.replica_device_setter(
                worker_device=d, ps_device='/cpu:0', ps_tasks=1) for d in raw_devices]

-        # TODO XXX share this part of code
-        get_opt_fn = memoized(get_opt_fn)
+        get_grad_fn, get_opt_fn = DataParallelBuilder._make_fn(input, get_cost_fn, get_opt_fn)

-        def get_grads():
-            ctx = get_current_tower_context()
-            cost = get_cost_fn(*input.get_input_tensors())
-
-            varlist = ctx.filter_vars_by_vs_name(tf.trainable_variables())
-            opt = get_opt_fn()
-            grads = opt.compute_gradients(
-                cost, var_list=varlist,
-                gate_gradients=False, colocate_gradients_with_ops=True)
-            grads = FilterNoneGrad().process(grads)
-            return grads
-
-        grad_list = DataParallelBuilder.build_on_towers(self.towers, get_grads, devices)
+        grad_list = DataParallelBuilder.build_on_towers(self.towers, get_grad_fn, devices)
        DataParallelBuilder._check_grad_list(grad_list)

        # debug tower performance (without update):
@@ -201,6 +231,14 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):


 class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
+    """
+    Graph builder for data-parallel training in "replicated" mode,
+    where each GPU contains a replica of the whole model.
+    It will build one tower on each GPU under its own variable scope.
+    Each gradient update is averaged across all GPUs through NCCL.
+
+    See https://www.tensorflow.org/performance/benchmarks for details.
+    """
    @staticmethod
    def _allreduce_grads(tower_grads):
        from tensorflow.contrib import nccl
@@ -224,25 +262,27 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
        return new_tower_grads

    def build(self, input, get_cost_fn, get_opt_fn):
-        raw_devices = ['/gpu:{}'.format(k) for k in self.towers]
+        """
+        Args:
+            input (InputSource): the input. Should have been set up.
+            get_cost_fn ([tf.Tensor] -> tf.Tensor): callable which takes a list of input tensors
+                and returns a cost tensor
+            get_opt_fn (-> tf.train.Optimizer): callable which returns an optimizer

-        get_opt_fn = memoized(get_opt_fn)
+        Returns:
+            tf.Operation: the training op.
+            tf.Operation: the op which syncs variables from GPU 0 to other GPUs.
+                It has to be run before the training has started.
+                And you can optionally run it later to sync non-trainable variables.
+        """
+        raw_devices = ['/gpu:{}'.format(k) for k in self.towers]

-        def get_grads():
-            ctx = get_current_tower_context()
-            cost = get_cost_fn(*input.get_input_tensors())
-
-            varlist = ctx.filter_vars_by_vs_name(tf.trainable_variables())
-            opt = get_opt_fn()
-            grads = opt.compute_gradients(
-                cost, var_list=varlist,
-                gate_gradients=False, colocate_gradients_with_ops=True)
-            grads = FilterNoneGrad().process(grads)
-            return grads
+        get_grad_fn, get_opt_fn = DataParallelBuilder._make_fn(input, get_cost_fn, get_opt_fn)

        grad_list = DataParallelBuilder.build_on_towers(
            self.towers,
-            get_grads,  # use no variable scope for the first tower
+            get_grad_fn,
+            # use no variable scope for the first tower
            use_vs=[False] + [True] * (len(self.towers) - 1))
        grads = SyncMultiGPUReplicatedBuilder._allreduce_grads(grad_list)

@@ -292,11 +332,33 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):


 class AsyncMultiGPUBuilder(DataParallelBuilder):
+    """
+    Graph builder for data-parallel training with async update.
+    It builds one tower on each GPU with shared variable scope.
+    Every tower computes the gradients and independently applies them to the
+    variables, without synchronizing and averaging across towers.
+    """
+
    def __init__(self, towers, scale_gradient=True):
+        """
+        Args:
+            towers(list[int]): list of GPU ids.
+            scale_gradient (bool): if True, will scale each gradient by ``1.0/nr_gpu``.
+        """
        super(AsyncMultiGPUBuilder, self).__init__(towers)
        self._scale_gradient = scale_gradient

    def build(self, input, get_cost_fn, get_opt_fn):
+        """
+        Args:
+            input (InputSource): the input. Should have been set up.
+            get_cost_fn ([tf.Tensor] -> tf.Tensor): callable which takes a list of input tensors
+                and returns a cost tensor
+            get_opt_fn (-> tf.train.Optimizer): callable which returns an optimizer
+
+        Returns:
+            tf.Operation: the training op
+        """
        ps_device = 'cpu' if len(self.towers) >= 4 else 'gpu'

        if ps_device == 'gpu':
@@ -306,21 +368,9 @@ class AsyncMultiGPUBuilder(DataParallelBuilder):
            devices = [tf.train.replica_device_setter(
                worker_device=d, ps_device='/cpu:0', ps_tasks=1) for d in raw_devices]

-        get_opt_fn = memoized(get_opt_fn)
-
-        def get_grads():
-            ctx = get_current_tower_context()
-            cost = get_cost_fn(*input.get_input_tensors())
-
-            varlist = ctx.filter_vars_by_vs_name(tf.trainable_variables())
-            opt = get_opt_fn()
-            grads = opt.compute_gradients(
-                cost, var_list=varlist,
-                gate_gradients=False, colocate_gradients_with_ops=True)
-            grads = FilterNoneGrad().process(grads)
-            return grads
+        get_grad_fn, get_opt_fn = DataParallelBuilder._make_fn(input, get_cost_fn, get_opt_fn)

-        grad_list = DataParallelBuilder.build_on_towers(self.towers, get_grads, devices)
+        grad_list = DataParallelBuilder.build_on_towers(self.towers, get_grad_fn, devices)
        DataParallelBuilder._check_grad_list(grad_list)

        if self._scale_gradient and len(self.towers) > 1:

--- a/tensorpack/train/base.py
+++ b/tensorpack/train/base.py
@@ -253,14 +253,6 @@ class Trainer(object):
                self._callbacks.after_train()
                self.hooked_sess.close()

-    # Predictor related methods. They actually should not be part of a trainer:
-    @property
-    def vs_name_for_predictor(self):
-        # The variable scope name a predictor should be built in.
-        # Expected to be changed. Don't use it.
-        # TODO graphbuilder knows it
-        return ""
-
    def get_predictor(self, input_names, output_names, tower=0):
        """
        Returns a callable predictor built under ``is_training=False`` tower context.

--- a/tensorpack/train/distributed.py
+++ b/tensorpack/train/distributed.py
@@ -20,13 +20,7 @@ __all__ = ['DistributedTrainerReplicated']

 class DistributedTrainerReplicated(Trainer):
    """
-    Distributed replicated training.
-    Each worker process builds the same model on one or more GPUs.
-    Gradients across GPUs are averaged within the worker,
-    and get synchronously applied to the global copy of variables located on PS.
-    Then each worker copy the latest variables from PS back to local.
-
-    See https://www.tensorflow.org/performance/benchmarks for details.
+    Build the graph with :class:`DistributedReplicatedBuilder` and train it.

    Note:
        Gradients are not averaged across workers, but applied to PS variables
@@ -154,7 +148,3 @@ class DistributedTrainerReplicated(Trainer):
                return _create_session()

        self.config.session_creator = _Creator()
-
-    @property
-    def vs_name_for_predictor(self):
-        return "tower0"
--- a/tensorpack/train/multigpu.py
+++ b/tensorpack/train/multigpu.py
@@ -47,11 +47,7 @@ def apply_prefetch_policy(config, gpu_prefetch=True):

 class SyncMultiGPUTrainerParameterServer(Trainer):
    """
-    A data-parallel multi-GPU trainer. It builds one tower on each GPU with
-    shared variable scope. It synchronoizes the gradients computed
-    from each tower, averages them and applies to the shared variables.
-
-    See https://www.tensorflow.org/performance/benchmarks for details.
+    Build graph with :class:`SyncMultiGPUParameterServerBuilder` and train it.
    """

    def __init__(self, config, ps_device='gpu', gpu_prefetch=True):
@@ -93,11 +89,7 @@ def SyncMultiGPUTrainer(config):

 class SyncMultiGPUTrainerReplicated(Trainer):
    """
-    Data-parallel multi-GPU trainer where each GPU contains a replicate of the whole model.
-    It will build one tower on each GPU under its own variable scope.
-    Each gradient update is averaged across or GPUs through NCCL.
-
-    See https://www.tensorflow.org/performance/benchmarks for details.
+    Build graph with :class:`SyncMultiGPUReplicatedBuilder` and train it.
    """
    def __init__(self, config, gpu_prefetch=True):
        """
@@ -126,11 +118,8 @@ class SyncMultiGPUTrainerReplicated(Trainer):

 class AsyncMultiGPUTrainer(Trainer):
    """
-    A data-parallel multi-GPU trainer. It builds one tower on each GPU with shared variable scope.
-    Every tower computes the gradients and independently applies them to the
-    variables, without synchronizing and averaging across towers.
+    Build graph with :class:`AsyncMultiGPUBuilder` and train it.
    """
-
    def __init__(self, config, scale_gradient=True):
        """
        Args: