Commit 5c25afcb authored Dec 18, 2017 by Yuxin Wu

update docs

parent 4692e325
Showing 2 changed files with 30 additions and 6 deletions (+30 -6):

tensorpack/graph_builder/distributed.py (+28 -5)
tensorpack/train/trainers.py (+2 -1)
tensorpack/graph_builder/distributed.py
@@ -62,8 +62,28 @@ class DistributedBuilderBase(GraphBuilder):
 
 class DistributedParameterServerBuilder(DataParallelBuilder, DistributedBuilderBase):
+    """
+    Distributed parameter server training.
+    A single copy of parameters are scattered around PS.
+    Gradients across GPUs are averaged within the worker, and applied to PS.
+    Each worker also caches the variables for reading.
+
+    It is an equivalent of ``--variable_update=parameter_server`` in
+    `tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
+
+    Note:
+        1. Gradients are not averaged across workers, but applied to PS variables
+           directly (either with or without locking depending on the optimizer).
+    """
 
     def __init__(self, towers, server, caching_device):
+        """
+        Args:
+            towers (list[int]): list of GPU ids.
+            server (tf.train.Server): the server with ps and workers.
+                job_name must be 'worker'.
+            caching_device (str): either 'cpu' or 'gpu'
+        """
         DataParallelBuilder.__init__(self, towers)
         DistributedBuilderBase.__init__(self, server)
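The new docstring assumes a tf.train.Server whose job_name is 'worker'. As a rough illustration (not part of this commit; host addresses are placeholders), such a server could be built with the TF 1.x API like this:

import tensorflow as tf

# Placeholder cluster: one PS and two workers. Every process in the
# cluster runs one tf.train.Server with its own job_name/task_index.
cluster = tf.train.ClusterSpec({
    'ps': ['host1:2222'],
    'worker': ['host1:2223', 'host2:2223'],
})
server = tf.train.Server(cluster, job_name='worker', task_index=0)

# The builder documented above would then receive this server, e.g.:
# DistributedParameterServerBuilder(towers=[0, 1], server=server,
#                                   caching_device='cpu')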
@@ -120,9 +140,13 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
     `tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
 
     Note:
-        Gradients are not averaged across workers, but applied to PS variables
+        1. Gradients are not averaged across workers, but applied to PS variables
            directly (either with or without locking depending on the optimizer).
+        2. Some details about collections: all variables created inside tower
+           will become local variables,
+           and a clone will be made in global variables for all trainable/model variables.
 
     Example:
 
         .. code-block:: python
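The new note about collections can be checked with the standard TF 1.x collection API. A minimal sketch (illustrative, not from this commit): after the graph is built by this builder, tower copies should appear under LOCAL_VARIABLES, while the clones made for trainable/model variables live under GLOBAL_VARIABLES:

import tensorflow as tf

# Inspect the two graph collections the note refers to (TF 1.x API).
local_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES)      # tower copies
global_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)    # PS-side clones
trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

# Per the note, every trainable variable should have a global clone.
print(len(local_vars), len(global_vars), len(trainable_vars))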
@@ -142,9 +166,9 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
             # Start training like this:
             (host1)$ train.py --job worker --task 0
-            (host1)$ train.py --job ps --task 0
+            (host1)$ CUDA_VISIBLE_DEVICES= train.py --job ps --task 0
             (host2)$ train.py --job worker --task 1
-            (host2)$ train.py --job ps --task 1
+            (host2)$ CUDA_VISIBLE_DEVICES= train.py --job ps --task 1
         """
 
     def __init__(self, towers, server):
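The updated example prefixes the ps commands with CUDA_VISIBLE_DEVICES= so that the parameter-server process, which only stores and serves variables, does not claim GPU memory on a host it shares with a worker. A hedged sketch of the same idea done inside a train.py (the --job/--task flags follow the example above; everything else is illustrative):

import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--job', choices=['ps', 'worker'])
parser.add_argument('--task', type=int, default=0)
args = parser.parse_args()

# Hide GPUs from the ps process *before* TensorFlow initializes CUDA,
# mirroring the `CUDA_VISIBLE_DEVICES=` shell prefix in the docstring.
if args.job == 'ps':
    os.environ['CUDA_VISIBLE_DEVICES'] = ''

import tensorflow as tf  # noqa: E402  (imported after the env var is set)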
@@ -152,8 +176,7 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
         Args:
             towers (list[int]): list of GPU ids.
             server (tf.train.Server): the server with ps and workers.
-                The job_name must be 'worker' because 'ps' job doesn't need to
-                build any graph.
+                job_name must be 'worker'.
         """
         DataParallelBuilder.__init__(self, towers)
         DistributedBuilderBase.__init__(self, server)
tensorpack/train/trainers.py
@@ -160,7 +160,6 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
 
 class DistributedTrainerBase(SingleCostTrainer):
 
     devices = None
-    # TODO use full device name instead of id
 
     def __init__(self, gpus, server):
         super(DistributedTrainerBase, self).__init__()
@@ -195,6 +194,8 @@ class DistributedTrainerParameterServer(DistributedTrainerBase):
         """
         Args:
             gpus ([int]): list of GPU ids.
+            server (tf.train.Server): the server with ps and workers.
+            caching_device (str): either 'cpu' or 'gpu'. The device to cache variables copied from PS
         """
         super(DistributedTrainerParameterServer, self).__init__(gpus, server)
         assert self.job_name in ['ps', 'worker'], self.job_name
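For context, a minimal sketch of the trainer with the newly documented arguments (the cluster addresses are placeholders; the caching_device keyword is assumed from the docstring added in this commit):

import tensorflow as tf
from tensorpack.train.trainers import DistributedTrainerParameterServer

cluster = tf.train.ClusterSpec({'ps': ['host1:2222'],
                                'worker': ['host1:2223']})
server = tf.train.Server(cluster, job_name='worker', task_index=0)

trainer = DistributedTrainerParameterServer(
    gpus=[0, 1],           # GPU ids used as training towers on this worker
    server=server,         # per the assert above, job_name is 'ps' or 'worker'
    caching_device='cpu',  # where each worker caches variables copied from PS
)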