Commit 8ec145ee authored by Yuxin Wu

update docs and debug the size of StagingInput

parent 5c25afcb
@@ -135,6 +135,7 @@ class ILSVRC12Files(RNGDataFlow):
         self.full_dir = os.path.join(dir, name)
         self.name = name
         assert os.path.isdir(self.full_dir), self.full_dir
+        assert meta_dir is None or os.path.isdir(meta_dir), meta_dir
         if shuffle is None:
             shuffle = name == 'train'
         self.shuffle = shuffle
...
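The added assertion makes a bad `meta_dir` fail at construction time instead of later, when the metadata files are actually read. A hedged usage sketch follows; the paths and the import location are placeholders assumed for illustration, not part of the commit:

    # Hypothetical usage sketch (paths are placeholders; import path assumed
    # from tensorpack's usual layout).
    from tensorpack.dataflow.dataset import ILSVRC12Files

    # meta_dir is optional; when given, it must be an existing directory,
    # otherwise the new assertion fails immediately in the constructor.
    train = ILSVRC12Files('/data/ILSVRC12', 'train', meta_dir='/data/ILSVRC12/meta')

    # shuffle defaults to True only for the 'train' split (see the hunk above).
    val = ILSVRC12Files('/data/ILSVRC12', 'val', shuffle=False)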
@@ -165,10 +165,10 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
     .. code-block:: none

         # Start training like this:
-        (host1)$ train.py --job worker --task 0
-        (host1)$ CUDA_VISIBLE_DEVICES= train.py --job ps --task 0
-        (host2)$ train.py --job worker --task 1
-        (host2)$ CUDA_VISIBLE_DEVICES= train.py --job ps --task 1
+        (host1)$ ./train.py --job worker --task 0
+        (host1)$ CUDA_VISIBLE_DEVICES= ./train.py --job ps --task 0
+        (host2)$ ./train.py --job worker --task 1
+        (host2)$ CUDA_VISIBLE_DEVICES= ./train.py --job ps --task 1
     """

     def __init__(self, towers, server):
...
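The commands in this docstring start one worker and one parameter-server process per host, with the ps processes seeing no GPUs because `CUDA_VISIBLE_DEVICES` is emptied. As a rough illustration of what a `train.py` driven by `--job`/`--task` typically does with these flags, here is a minimal sketch using plain TensorFlow 1.x primitives; the host:port lists and flag parsing are placeholders and not taken from tensorpack:

    # Minimal sketch of turning --job/--task flags into a tf.train.Server
    # (hosts/ports are placeholders; tensorpack's actual wiring may differ).
    import argparse
    import tensorflow as tf

    parser = argparse.ArgumentParser()
    parser.add_argument('--job', choices=['worker', 'ps'])
    parser.add_argument('--task', type=int, default=0)
    args = parser.parse_args()

    cluster = tf.train.ClusterSpec({
        'ps': ['host1:2222', 'host2:2222'],
        'worker': ['host1:2223', 'host2:2223'],
    })
    server = tf.train.Server(cluster, job_name=args.job, task_index=args.task)

    if args.job == 'ps':
        server.join()   # parameter servers only serve variables
    else:
        pass            # a worker would build the graph and train,
                        # e.g. by handing `server` to the builder/trainer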
@@ -483,15 +483,14 @@ class StagingInput(FeedfreeInput):
         A callback registered by this input source, to make sure stage/unstage
         is run at each step.
         """
-        def __init__(self, stage_op_fn, unstage_op_fn, nr_stage):
+        def __init__(self, input, nr_stage):
             self.nr_stage = nr_stage
-            self.stage_op_fn = stage_op_fn
-            self.unstage_op_fn = unstage_op_fn
+            self._input = input
             self._initialized = False

         def _setup_graph(self):
-            self.stage_op = self.stage_op_fn()
-            unstage_op = self.unstage_op_fn()
+            self.stage_op = self._input._get_stage_op()
+            unstage_op = self._input._get_unstage_op()
             self.fetches = tf.train.SessionRunArgs(
                 fetches=[self.stage_op, unstage_op])
@@ -523,6 +522,7 @@ class StagingInput(FeedfreeInput):
         self._areas = []
         self._stage_ops = []
         self._unstage_ops = []
+        # self._size_ops = []

     def _setup(self, inputs):
         self._input.setup(inputs)
@@ -530,10 +530,8 @@ class StagingInput(FeedfreeInput):
     def _get_callbacks(self):
         cbs = self._input.get_callbacks()
-        # Pass a lambda to be called later, because stage ops have not been built
         cbs.append(
-            StagingInput.StagingCallback(
-                lambda: self._get_stage_op(), lambda: self._get_unstage_op(), self._nr_stage))
+            StagingInput.StagingCallback(self, self._nr_stage))
         return cbs

     def _size(self):
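Taken together, the three hunks above simplify `StagingCallback`: instead of receiving two lambdas that defer op construction, it now keeps a reference to the owning `StagingInput` and asks it for the stage/unstage ops inside `_setup_graph`, which runs only after the graph (and therefore the staging areas) has been built; `_get_callbacks` then only needs to pass `self` and `self._nr_stage`. The sketch below illustrates the same per-step pattern with a plain `tf.train.SessionRunHook`; the method names and the prefill step are illustrative, not copied from tensorpack:

    # Illustrative sketch only: hold a reference to the input object and build
    # the ops lazily once the graph exists, then run them every step.
    import tensorflow as tf

    class StagingHook(tf.train.SessionRunHook):
        def __init__(self, staging_input, nr_stage):
            self._input = staging_input   # object that owns the StagingAreas
            self._nr_stage = nr_stage     # how many batches to prefill
            self._initialized = False

        def begin(self):
            # Ops can be fetched directly here: the graph is already built.
            self._stage_op = self._input._get_stage_op()
            unstage_op = self._input._get_unstage_op()
            self._fetches = tf.train.SessionRunArgs(
                fetches=[self._stage_op, unstage_op])

        def after_create_session(self, session, coord):
            if not self._initialized:
                # Prefill the staging areas so get() never blocks on an empty area.
                for _ in range(self._nr_stage):
                    session.run(self._stage_op)
                self._initialized = True

        def before_run(self, run_context):
            # Each training step pushes one new batch and pops one.
            return self._fetches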
@@ -560,6 +558,7 @@ class StagingInput(FeedfreeInput):
             for vin, vout in zip(inputs, outputs):
                 vout.set_shape(vin.get_shape())
             self._unstage_ops.append(outputs)
+            # self._size_ops.append(stage.size())
             return outputs

     def _get_stage_op(self):
@@ -571,5 +570,17 @@ class StagingInput(FeedfreeInput):
         all_outputs = list(chain.from_iterable(self._unstage_ops))
         return tf.group(*all_outputs)

+    # for debugging only
+    def _create_ema_callback(self):
+        def create_ema_op():
+            with self.cached_name_scope():
+                avg_size = tf.truediv(tf.add_n(self._size_ops), len(self._size_ops), name='avg_stagingarea_size')
+                return add_moving_summary(avg_size, collection=None)[0].op
+        return RunOp(
+            create_ema_op,
+            run_before=False,
+            run_as_trigger=False,
+            run_step=True)


 StagingInputWrapper = StagingInput
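The commented-out `self._size_ops` bookkeeping and the new `_create_ema_callback` form an optional debugging aid: each StagingArea would report its current `size()`, the sizes are averaged into `avg_stagingarea_size`, and a `RunOp` callback maintains a moving-average summary of that value at every step, so one can check during training whether the areas really hold the expected number of prefetched batches. Below is a standalone sketch of the same measurement with plain TF 1.x ops; the toy areas and names are placeholders and independent of tensorpack:

    # Standalone sketch: measure average StagingArea occupancy (TF 1.x).
    import tensorflow as tf
    from tensorflow.python.ops.data_flow_ops import StagingArea

    # Two toy staging areas holding one scalar float each (stand-ins for the
    # per-GPU areas that StagingInput builds).
    areas = [StagingArea(dtypes=[tf.float32], shapes=[[]]) for _ in range(2)]
    x = tf.constant(1.0)
    stage_ops = [a.put([x]) for a in areas]
    unstage_ops = [a.get() for a in areas]

    # Average the integer sizes across areas, as the debug callback above does.
    size_ops = [tf.to_float(a.size()) for a in areas]
    avg_size = tf.truediv(tf.add_n(size_ops), float(len(size_ops)),
                          name='avg_stagingarea_size')
    tf.summary.scalar('avg_stagingarea_size', avg_size)

    with tf.Session() as sess:
        sess.run(stage_ops)           # prefill once
        print(sess.run(avg_size))     # -> 1.0
        sess.run(unstage_ops)
        print(sess.run(avg_size))     # -> 0.0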
@@ -215,7 +215,7 @@ def add_moving_summary(*args, **kwargs):
     ctx = get_current_tower_context()
     # allow ctx to be none
     if ctx is not None and not ctx.is_main_training_tower:
-        return
+        return []
     if not isinstance(args[0], list):
         v = args
...
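Returning `[]` instead of `None` on non-main towers gives `add_moving_summary` a consistent return type, so callers such as the debugging callback above can index or iterate the result without a special case. A small self-contained illustration; the stub function is a stand-in, not the real implementation:

    # Why a consistent list return type helps callers (stub, for illustration).
    import tensorflow as tf

    def add_moving_summary_stub(tensor, on_main_tower):
        if not on_main_tower:
            return []                     # new behaviour: empty list, not None
        ema_tensor = tf.identity(tensor)  # stand-in for the real EMA tensor
        return [ema_tensor]

    cost = tf.constant(0.5)
    for t in add_moving_summary_stub(cost, on_main_tower=False):
        print(t)                          # iterates zero times; no None check needed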
@@ -283,12 +283,7 @@ class HorovodTrainer(SingleCostTrainer):
     Note:
         1. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
-        2. About performance, horovod is expected to be slightly
-           slower than native tensorflow on multi-GPU training, but faster in distributed training.
-        3. Due to the use of MPI, training is less informative (no progress bar).
-           It's recommended to use other multi-GPU trainers for single-node
-           experiments, and scale to multi nodes by horovod.
+        2. Due to the use of MPI, training is less informative (no progress bar).
     """

     def __init__(self):
         hvd.init()
...
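The trimmed note keeps only the MPI caveat (processes are launched by `mpirun`, one per GPU, so there is no interactive progress bar). For orientation, here is a minimal horovod + TF 1.x skeleton using only public horovod APIs; it is an illustration of the moving parts, not tensorpack's `HorovodTrainer`:

    # Minimal horovod + TF 1.x skeleton (illustrative; model and loop are toy).
    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()                                    # one process per GPU, launched by mpirun

    # Pin each process to its own GPU.
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    x = tf.random_normal([32, 10])
    loss = tf.reduce_mean(tf.layers.dense(x, 1) ** 2)

    opt = tf.train.GradientDescentOptimizer(0.01 * hvd.size())
    opt = hvd.DistributedOptimizer(opt)           # allreduce gradients over MPI
    train_op = opt.minimize(loss)

    hooks = [hvd.BroadcastGlobalVariablesHook(0)]  # sync initial weights from rank 0
    with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as sess:
        for _ in range(100):
            sess.run(train_op)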