Commit fb43cf03 authored by Yuxin Wu

Use StageArea by default in SyncMultiGPUTrainer. fix #140

parent ba2758b3
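This commit makes SyncMultiGPUTrainer wrap its input pipeline in a StagingInputWrapper by default, so each training tower reads its batch from a TensorFlow StagingArea that already lives on the GPU: the put of the next batch runs concurrently with the compute of the current step, hiding the host-to-device copy. Below is a minimal sketch of the underlying StagingArea pattern in TF 1.x, not tensorpack's actual StagingInputWrapper; the placeholder shapes and the session loop are illustrative assumptions.

import tensorflow as tf

# Illustrative input tensors; a real pipeline would dequeue them from a queue.
images = tf.placeholder(tf.float32, [None, 224, 224, 3])
labels = tf.placeholder(tf.int32, [None])

with tf.device('/gpu:0'):
    # One StagingArea per GPU buffers the next batch on-device.
    stage = tf.contrib.staging.StagingArea(dtypes=[tf.float32, tf.int32])
    stage_op = stage.put([images, labels])       # copies the NEXT batch to the GPU
    staged_images, staged_labels = stage.get()   # returns the batch staged last step

# Build the model on staged_images/staged_labels, then run each training step as
#   sess.run([train_op, stage_op], feed_dict=...)
# so the copy for step k+1 overlaps the compute of step k. One warm-up
# sess.run(stage_op, feed_dict=...) is needed to fill the pipeline.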
@@ -41,10 +41,9 @@ It's Yet Another TF wrapper, but different in:
 3. Focus on training speed.
   + Tensorpack trainer is almost always faster than `feed_dict` based wrappers.
     Even on a small CNN example, the training runs [2x faster](https://gist.github.com/ppwwyyxx/8d95da79f8d97036a7d67c2416c851b6) than the equivalent Keras code.
-    More improvements to come later.
-  + Data-Parallel Multi-GPU training is off-the-shelf to use.
-    You can also define your own trainer for different style of training (e.g. GAN) without losing the efficiency.
+  + Data-Parallel Multi-GPU training is off-the-shelf to use. For <=4 GPUs it is as fast as [tensorflow/benchmarks](https://github.com/tensorflow/benchmarks).
+    More improvements to come later.

 4. Interface of extensible __Callbacks__.
   Write a callback to implement everything you want to do apart from the training iterations, and
@@ -17,7 +17,7 @@ from ..tfutils.gradproc import FilterNoneGrad, ScaleGradient
 from .base import Trainer
 from .feedfree import SingleCostFeedfreeTrainer
-from .input_data import QueueInput
+from .input_data import QueueInput, StagingInputWrapper

 __all__ = ['SyncMultiGPUTrainer', 'AsyncMultiGPUTrainer']
@@ -76,12 +76,16 @@ class SyncMultiGPUTrainer(MultiGPUTrainer,
         else:
             assert input_queue is None, input_queue
             self._input_method = config.data
-            # assert isinstance(self._input_method, QueueInput)
-        super(SyncMultiGPUTrainer, self).__init__(config)
+
         assert len(config.tower) >= 1, "MultiGPUTrainer must be used with at least one tower."
         if len(config.tower) > 1:
             assert tf.test.is_gpu_available()

+        if not isinstance(self._input_method, StagingInputWrapper):
+            devices = ['/gpu:{}'.format(k) for k in config.tower]
+            self._input_method = StagingInputWrapper(self._input_method, devices)
+
+        super(SyncMultiGPUTrainer, self).__init__(config)
         self.average_cost = average_cost

     @staticmethod
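With the hunk above, any input method that is not already a StagingInputWrapper is wrapped automatically, with one staging device per tower in config.tower, and the parent constructor now runs only afterwards so it sees the wrapped input. A hedged sketch of the resulting call site; MyModel and my_dataflow are placeholder names, and exact import paths vary across tensorpack versions:

# Hypothetical usage; MyModel / my_dataflow are placeholders.
from tensorpack.train.config import TrainConfig
from tensorpack.train.input_data import QueueInput
from tensorpack.train.multigpu import SyncMultiGPUTrainer

config = TrainConfig(
    data=QueueInput(my_dataflow),   # a plain, non-staged input method
    model=MyModel(),
    tower=[0, 1],                   # two training towers
)
# The constructor now rewrites config.data to
#   StagingInputWrapper(QueueInput(...), ['/gpu:0', '/gpu:1'])
# so staged input is on by default, with no change to user code.
SyncMultiGPUTrainer(config).train()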
@@ -161,7 +165,6 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer,
         else:
             assert input_queue is None, input_queue
             self._input_method = config.data
-            assert isinstance(self._input_method, QueueInput)
         super(AsyncMultiGPUTrainer, self).__init__(config)

         self._scale_gradient = scale_gradient
...@@ -194,7 +197,7 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer, ...@@ -194,7 +197,7 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer,
train_op = self.config.optimizer.apply_gradients(grad_list[k]) train_op = self.config.optimizer.apply_gradients(grad_list[k])
def f(op=train_op): # avoid late-binding def f(op=train_op): # avoid late-binding
self.sess.run([op]) self.sess.run([op]) # TODO this won't work with StageInput
next(self.async_step_counter) # atomic due to GIL next(self.async_step_counter) # atomic due to GIL
th = LoopThread(f) th = LoopThread(f)
th.name = "AsyncLoopThread-{}".format(k) th.name = "AsyncLoopThread-{}".format(k)
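The def f(op=train_op) idiom in the hunk above (flagged by the "# avoid late-binding" comment) is the standard Python fix for late binding: a closure captures the loop variable, not its value, so without the default argument every per-tower LoopThread would end up running the last tower's train_op. A standalone illustration:

# Closures see the loop variable's final value: every function returns 2.
fns = [lambda: k for k in range(3)]
print([f() for f in fns])    # -> [2, 2, 2]

# A default argument is evaluated at definition time, freezing each value.
fns = [lambda k=k: k for k in range(3)]
print([f() for f in fns])    # -> [0, 1, 2]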