use ps_device=cpu when #gpu>=4

8df2d7c5 · Yuxin Wu · 8419ee3f · 8df2d7c5 · 8df2d7c5 · 8df2d7c5
Commit 8df2d7c5 authored Aug 04, 2017 by Yuxin Wu
Showing with 20 additions and 9 deletions

examples/DeepQNetwork/DQN.py examples/DeepQNetwork/DQN.py +2 -5

tensorpack/train/distributed.py tensorpack/train/distributed.py +1 -0

tensorpack/train/multigpu.py tensorpack/train/multigpu.py +17 -4

No files found.
--- a/examples/DeepQNetwork/DQN.py
+++ b/examples/DeepQNetwork/DQN.py
@@ -96,7 +96,6 @@ class Model(DQNModel):


 def get_config():
-    M = Model()
    expreplay = ExpReplay(
        predictor_io_names=(['state'], ['Qvalue']),
        player=get_player(train=True),
@@ -111,6 +110,7 @@ def get_config():

    return TrainConfig(
        dataflow=expreplay,
+        model=Model(),
        callbacks=[
            ModelSaver(),
            PeriodicTrigger(
@@ -128,11 +128,8 @@ def get_config():
                every_k_epochs=10),
            HumanHyperParamSetter('learning_rate'),
        ],
-        model=M,
        steps_per_epoch=STEPS_PER_EPOCH,
        max_epoch=1000,
-        # run the simulator on a separate GPU if available
-        predict_tower=[1] if get_nr_gpu() > 1 else [0],
    )


@@ -172,5 +169,5 @@ if __name__ == '__main__':
                os.path.basename(ROM_FILE).split('.')[0])))
        config = get_config()
        if args.load:
-            config.session_init = SaverRestore(args.load)
+            config.session_init = get_model_loader(args.load)
        QueueInputTrainer(config).train()
--- a/tensorpack/train/distributed.py
+++ b/tensorpack/train/distributed.py
@@ -42,6 +42,7 @@ class DistributedTrainerReplicated(MultiGPUTrainerBase):
    and get synchronously applied to the global copy of variables located on PS.
    Then each worker copies the latest variables from PS back to local.

+    See https://www.tensorflow.org/performance/benchmarks for details.

    Note:
        Gradients are not averaged across workers.

--- a/tensorpack/train/multigpu.py
+++ b/tensorpack/train/multigpu.py
@@ -153,18 +153,23 @@ class SyncMultiGPUTrainerParameterServer(MultiGPUTrainerBase):
    A data-parallel multi-GPU trainer. It builds one tower on each GPU with
    shared variable scope. It synchronizes the gradients computed
    from each tower, averages them and applies to the shared variables.
+
+    See https://www.tensorflow.org/performance/benchmarks for details.
    """

-    def __init__(self, config, ps_device='gpu', gpu_prefetch=True):
+    def __init__(self, config, ps_device=None, gpu_prefetch=True):
        """
        Args:
            config(TrainConfig): Must contain 'model' and either one of 'data' or 'dataflow'.
-            ps_device: either 'gpu' or 'cpu', where variables are stored.
+            ps_device: either 'gpu' or 'cpu', where variables are stored. Setting to 'cpu' might help when #gpu >= 4.
+                Defaults to 'cpu' when #gpu >= 4, and to 'gpu' otherwise.
            gpu_prefetch(bool): whether to prefetch the data to each GPU. Usually improve performance.
        """
        apply_prefetch_policy(config, gpu_prefetch)
        self._input_source = config.data

+        if ps_device is None:
+            ps_device = 'cpu' if config.nr_tower >= 4 else 'gpu'
        assert ps_device in ['gpu', 'cpu'], ps_device
        self._ps_device = ps_device
        super(SyncMultiGPUTrainerParameterServer, self).__init__(config)
@@ -248,6 +253,8 @@ class SyncMultiGPUTrainerReplicated(MultiGPUTrainerBase):
    Data-parallel multi-GPU trainer where each GPU contains a replicate of the whole model.
    It will build one tower on each GPU under its own variable scope.
    Each gradient update is averaged across all GPUs through NCCL.
+
+    See https://www.tensorflow.org/performance/benchmarks for details.
    """
    def __init__(self, config, gpu_prefetch=True):
        """
@@ -381,8 +388,14 @@ class AsyncMultiGPUTrainer(MultiGPUTrainerBase):
        """
        callbacks = input.setup(model.get_inputs_desc())

-        raw_devices = ['/gpu:{}'.format(k) for k in tower]
-        devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
+        ps_device = 'cpu' if len(tower) >= 4 else 'gpu'
+        # Both branches below iterate raw_devices, so build it unconditionally.
+        raw_devices = ['/gpu:{}'.format(k) for k in tower]
+        if ps_device == 'gpu':
+            devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
+        else:
+            devices = [tf.train.replica_device_setter(
+                worker_device=d, ps_device='/cpu:0', ps_tasks=1) for d in raw_devices]
        grad_list = MultiGPUTrainerBase.build_on_multi_tower(
            tower,
            lambda: MultiGPUTrainerBase._build_graph_get_grads(model, input), devices)