Commit 8ec145ee authored by Yuxin Wu

update docs and debug the size of StagingInput

parent 5c25afcb
@@ -135,6 +135,7 @@ class ILSVRC12Files(RNGDataFlow):
         self.full_dir = os.path.join(dir, name)
         self.name = name
         assert os.path.isdir(self.full_dir), self.full_dir
+        assert meta_dir is None or os.path.isdir(meta_dir), meta_dir
         if shuffle is None:
             shuffle = name == 'train'
         self.shuffle = shuffle
......
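The added assertion validates `meta_dir` the same way the dataset directory is validated. A minimal usage sketch of the constructor above (paths are placeholders; the import location is assumed to be the usual tensorpack one):

# Sketch only: paths are placeholders, import path assumed.
from tensorpack.dataflow.dataset import ILSVRC12Files

# name='train' makes shuffle default to True (shuffle = name == 'train');
# a non-None meta_dir must be an existing directory or the new assert fires.
df = ILSVRC12Files('/path/to/ILSVRC12', 'train', meta_dir='/path/to/ilsvrc_metadata')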
@@ -165,10 +165,10 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
     .. code-block:: none
         # Start training like this:
-        (host1)$ train.py --job worker --task 0
-        (host1)$ CUDA_VISIBLE_DEVICES= train.py --job ps --task 0
-        (host2)$ train.py --job worker --task 1
-        (host2)$ CUDA_VISIBLE_DEVICES= train.py --job ps --task 1
+        (host1)$ ./train.py --job worker --task 0
+        (host1)$ CUDA_VISIBLE_DEVICES= ./train.py --job ps --task 0
+        (host2)$ ./train.py --job worker --task 1
+        (host2)$ CUDA_VISIBLE_DEVICES= ./train.py --job ps --task 1
     """
     def __init__(self, towers, server):
......
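The docstring change only prefixes the commands with `./`. For context, the `server` argument taken by `__init__` is a `tf.train.Server` built from a cluster spec; a rough sketch with placeholder hosts and ports (not part of the commit):

# Placeholder hosts/ports; --job/--task in the commands above select one entry each.
import tensorflow as tf

cluster = tf.train.ClusterSpec({
    'ps':     ['host1:2222', 'host2:2222'],
    'worker': ['host1:2223', 'host2:2223'],
})
server = tf.train.Server(cluster, job_name='worker', task_index=0)
# The empty CUDA_VISIBLE_DEVICES= in the ps commands keeps parameter servers off the GPUs.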
@@ -483,15 +483,14 @@ class StagingInput(FeedfreeInput):
         A callback registered by this input source, to make sure stage/unstage
         is run at each step.
         """
-        def __init__(self, stage_op_fn, unstage_op_fn, nr_stage):
+        def __init__(self, input, nr_stage):
             self.nr_stage = nr_stage
-            self.stage_op_fn = stage_op_fn
-            self.unstage_op_fn = unstage_op_fn
+            self._input = input
             self._initialized = False
         def _setup_graph(self):
-            self.stage_op = self.stage_op_fn()
-            unstage_op = self.unstage_op_fn()
+            self.stage_op = self._input._get_stage_op()
+            unstage_op = self._input._get_unstage_op()
             self.fetches = tf.train.SessionRunArgs(
                 fetches=[self.stage_op, unstage_op])
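The callback now keeps a reference to its owning `StagingInput` and asks it for the stage/unstage ops in `_setup_graph`, instead of being handed two lambdas. A hedged sketch of the per-step hooks that would use these fields (the `_prefill` and `_before_run` bodies are not shown in this diff and are assumptions):

# Sketch of methods continuing the StagingCallback class above; not part of the diff.
def _prefill(self):
    # fill the StagingArea with nr_stage entries before the first real step
    for _ in range(self.nr_stage):
        self.stage_op.run()

def _before_run(self, ctx):
    if not self._initialized:
        self._initialized = True
        self._prefill()
    # every training step then runs one stage op and one unstage op
    return self.fetches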
@@ -523,6 +522,7 @@ class StagingInput(FeedfreeInput):
         self._areas = []
         self._stage_ops = []
         self._unstage_ops = []
+        # self._size_ops = []
     def _setup(self, inputs):
         self._input.setup(inputs)
@@ -530,10 +530,8 @@ class StagingInput(FeedfreeInput):
     def _get_callbacks(self):
         cbs = self._input.get_callbacks()
-        # Pass a lambda to be called later, because stage ops have not been built
         cbs.append(
-            StagingInput.StagingCallback(
-                lambda: self._get_stage_op(), lambda: self._get_unstage_op(), self._nr_stage))
+            StagingInput.StagingCallback(self, self._nr_stage))
         return cbs
     def _size(self):
@@ -560,6 +558,7 @@ class StagingInput(FeedfreeInput):
             for vin, vout in zip(inputs, outputs):
                 vout.set_shape(vin.get_shape())
             self._unstage_ops.append(outputs)
+            # self._size_ops.append(stage.size())
             return outputs
     def _get_stage_op(self):
@@ -571,5 +570,17 @@ class StagingInput(FeedfreeInput):
             all_outputs = list(chain.from_iterable(self._unstage_ops))
             return tf.group(*all_outputs)
+    # for debugging only
+    def _create_ema_callback(self):
+        def create_ema_op():
+            with self.cached_name_scope():
+                avg_size = tf.truediv(tf.add_n(self._size_ops), len(self._size_ops), name='avg_stagingarea_size')
+                return add_moving_summary(avg_size, collection=None)[0].op
+        return RunOp(
+            create_ema_op,
+            run_before=False,
+            run_as_trigger=False,
+            run_step=True)
 StagingInputWrapper = StagingInput
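Note that `_create_ema_callback` reads `self._size_ops`, which this commit leaves commented out both in `__init__` and where the StagingAreas are built, so the debugging callback is not usable as committed. Enabling the size tracking would presumably just mean uncommenting those two lines, roughly:

# Sketch: what the two commented-out lines above would do if enabled (not part of the commit).
# in __init__:
self._size_ops = []
# where each StagingArea `stage` is created:
self._size_ops.append(stage.size())  # scalar op: number of elements currently staged

The callback also indexes into the return value of `add_moving_summary`, which the next hunk changes to always return a list.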
@@ -215,7 +215,7 @@ def add_moving_summary(*args, **kwargs):
     ctx = get_current_tower_context()
     # allow ctx to be none
     if ctx is not None and not ctx.is_main_training_tower:
-        return
+        return []
     if not isinstance(args[0], list):
         v = args
......
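Returning `[]` instead of a bare `return` keeps the return type a list on non-main training towers, so callers can treat the result uniformly. An illustrative, hypothetical caller:

# Hypothetical caller; 'loss' is a placeholder scalar tensor built elsewhere.
ops = add_moving_summary(loss)   # now [] on non-main towers instead of None
for ema_op in ops:               # iterating [] is a no-op; iterating None raised TypeError
    tf.add_to_collection('my_ema_ops', ema_op)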
@@ -283,12 +283,7 @@ class HorovodTrainer(SingleCostTrainer):
     Note:
         1. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
-        2. About performance, horovod is expected to be slightly
-           slower than native tensorflow on multi-GPU training, but faster in distributed training.
-        3. Due to the use of MPI, training is less informative (no progress bar).
-           It's recommended to use other multi-GPU trainers for single-node
-           experiments, and scale to multi nodes by horovod.
+        2. Due to the use of MPI, training is less informative (no progress bar).
     """
     def __init__(self):
         hvd.init()
......
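For reference, this trainer is launched through MPI rather than as a plain Python process; a typical single-machine command looks roughly like the following (the GPU list and process count are placeholders):

# 4 processes, one per visible GPU; drop CUDA_VISIBLE_DEVICES to use all GPUs (note 1 above)
CUDA_VISIBLE_DEVICES=0,1,2,3 mpirun -np 4 python ./train.py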