misc updates

456f5675 · Yuxin Wu · 2238ca93 · 456f5675 · 456f5675 · 456f5675
Commit 456f5675 authored Jun 17, 2018 by Yuxin Wu
8 changed files
--- a/examples/DeepQNetwork/README.md
+++ b/examples/DeepQNetwork/README.md
@@ -20,7 +20,7 @@ Claimed performance in the paper can be reproduced, on several games I've tested

 ![DQN](curve-breakout.png)

-On one TitanX, Double-DQN took 1 day of training to reach a score of 400 on breakout game.
+On one TitanX, Double-DQN took 1 day of training to reach a score of 400 on breakout.
 Batch-A3C implementation only took <2 hours.

 Double-DQN with nature paper setting runs at 60 batches (3840 trained frames, 240 seen frames, 960 game frames) per second on (Maxwell) TitanX.
@@ -29,11 +29,9 @@ Double-DQN with nature paper setting runs at 60 batches (3840 trained frames, 24

 Install [ALE](https://github.com/mgbellemare/Arcade-Learning-Environment) and gym.

-Download an [atari rom](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms) to
-`$TENSORPACK_DATASET/atari_rom/` (defaults to ~/tensorpack_data/atari_rom/), e.g.:
+Download an [atari rom](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms), e.g.:
 ```
-mkdir -p ~/tensorpack_data/atari_rom
-wget https://github.com/openai/atari-py/raw/master/atari_py/atari_roms/breakout.bin -O ~/tensorpack_data/atari_rom/breakout.bin
+wget https://github.com/openai/atari-py/raw/master/atari_py/atari_roms/breakout.bin
 ```

 Start Training:
@@ -44,8 +42,9 @@ Start Training:

 Watch the agent play:
 ```
-./DQN.py --rom breakout.bin --task play --load path/to/model
+# Download pretrained models or use one you trained:
+wget http://models.tensorpack.com/DeepQNetwork/DoubleDQN-Breakout.npz
+./DQN.py --rom breakout.bin --task play --load DoubleDQN-Breakout.npz
 ```
-A pretrained model on breakout can be downloaded [here](http://models.tensorpack.com/DeepQNetwork/).

 A3C code and models for Atari games in OpenAI Gym are released in [examples/A3C-Gym](../A3C-Gym)
--- a/examples/DeepQNetwork/atari.py
+++ b/examples/DeepQNetwork/atari.py
@@ -83,7 +83,6 @@ class AtariPlayer(gym.Env):
            self.viz = viz
            if self.viz and isinstance(self.viz, float):
                self.windowname = os.path.basename(rom_file)
-                cv2.startWindowThread()
                cv2.namedWindow(self.windowname)

            self.ale.loadROM(rom_file.encode('utf-8'))

--- a/tensorpack/callbacks/base.py
+++ b/tensorpack/callbacks/base.py
@@ -250,6 +250,7 @@ class ProxyCallback(Callback):
            cb(Callback): the underlying callback
        """
        assert isinstance(cb, Callback), type(cb)
+        self.chief_only = cb.chief_only
        self.cb = cb

    def _before_train(self):

--- a/tensorpack/callbacks/prof.py
+++ b/tensorpack/callbacks/prof.py
@@ -25,6 +25,8 @@ class GPUUtilizationTracker(Callback):
    It will start a process to run `nvidia-smi` every second
    within the epoch (the trigger_epoch time was not included),
    and write average utilization to monitors.
+
+    This callback creates a process, therefore it cannot be used with MPI.
    """

    _chief_only = False
@@ -51,6 +53,7 @@ class GPUUtilizationTracker(Callback):
        assert len(self._devices), "[GPUUtilizationTracker] No GPU device given!"

    def _before_train(self):
+        assert 'OMPI_COMM_WORLD_SIZE' not in os.environ, "GPUUtilizationTracker cannot be used under MPI!"
        self._evt = mp.Event()
        self._stop_evt = mp.Event()
        self._queue = mp.Queue()

--- a/tensorpack/dataflow/image.py
+++ b/tensorpack/dataflow/image.py
@@ -71,6 +71,7 @@ class ImageFromFile(RNGDataFlow):
            self.rng.shuffle(self.files)
        for f in self.files:
            im = cv2.imread(f, self.imread_mode)
+            assert im is not None, f
            if self.channel == 3:
                im = im[:, :, ::-1]
            if self.resize is not None:

--- a/tensorpack/tfutils/argscope.py
+++ b/tensorpack/tfutils/argscope.py
@@ -6,8 +6,9 @@ from collections import defaultdict
 import copy
 from functools import wraps
 from inspect import isfunction, getmembers
+
+from .tower import get_current_tower_context
 from ..utils import logger
-import tensorflow as tf

 __all__ = ['argscope', 'get_arg_scope', 'enable_argscope_for_module']

@@ -74,18 +75,14 @@ def argscope_mapper(func, log_shape=True):
        actual_args = copy.copy(get_arg_scope()[func.__name__])
        actual_args.update(kwargs)
        out_tensor = func(*args, **actual_args)
-
-        scope_name = tf.get_variable_scope().name
-        is_tower_scope = 'tower' in scope_name
-
        in_tensor = args[0]
+
+        ctx = get_current_tower_context()
        name = '<unkown>' if 'name' not in kwargs else kwargs['name']
        if log_shape:
-            if is_tower_scope:
-                if 'tower0' in scope_name:
-                    logger.info('%20s: %20s -> %20s' % (name, in_tensor.shape.as_list(), out_tensor.shape.as_list()))
-            else:
-                logger.info('%20s: %20s -> %20s' % (name, in_tensor.shape.as_list(), out_tensor.shape.as_list()))
+            if ('tower' not in ctx.ns_name.lower()) or ctx.is_main_training_tower:
+                logger.info('%20s: %20s -> %20s' %
+                            (name, in_tensor.shape.as_list(), out_tensor.shape.as_list()))

        return out_tensor
    # argscope requires this property
@@ -98,6 +95,9 @@ def enable_argscope_for_module(module, log_shape=True):
    Overwrite all functions of a given module to support argscope.
    Note that this function monkey-patches the module and therefore could have unexpected consequences.
    It has been only tested to work well with `tf.layers` module.
+
+    Args:
+        log_shape (bool): print input/output shapes of each function when called.
    """
    for name, obj in getmembers(module):
        if isfunction(obj):

--- a/tensorpack/tfutils/tower.py
+++ b/tensorpack/tfutils/tower.py
@@ -20,7 +20,7 @@ _CurrentTowerContext = None

 class TowerContext(object):
    """ A context where the current model is built in.
-        Since TF1.8, TensorFlow starts to introduce the same concept.
+        Since TF 1.8, TensorFlow has started to introduce the same concept.
    """

    def __init__(self, tower_name, is_training, index=0, vs_name=''):

--- a/tensorpack/train/trainers.py
+++ b/tensorpack/train/trainers.py
@@ -297,42 +297,40 @@ class HorovodTrainer(SingleCostTrainer):
        # change trainer to HorovodTrainer(), then
        /path/to/mpirun -np 8 -H server1:4,server2:4  \\
            -bind-to none -map-by slot \\
-            --output-filename mylog  -x LD_LIBRARY_PATH -x CUDA_VISIBLE_DEVICES=0,1,2,3 \\
+            --output-filename mylog  -x LD_LIBRARY_PATH \\
            python train.py
-        # (Add other environment variables you need by -x, e.g. PYTHONPATH, PATH)
+        # Add other environment variables you need by -x, e.g. PYTHONPATH, PATH.
+        # If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
+        # There are other MPI options that can potentially improve performance, especially on special hardware.

    Note:
-        1. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
-
-        2. Due to the use of MPI, training is less informative (no progress bar).
-
-        3. Due to a TF bug, you must not initialize CUDA context before training.
+        1. Due to a TF bug, you must not initialize CUDA context before training.
           Therefore TF functions like `is_gpu_available()` or `list_local_devices()`
           must be avoided.

-        4. MPI does not like fork(). If your dataflow contains multiprocessing, it may cause problems.
+        2. MPI does not like fork(). If your dataflow contains multiprocessing, it may cause problems.

-        3. MPI sometimes fails to kill all processes. Be sure to check it.
+        3. MPI sometimes fails to kill all processes. Be sure to check it afterwards.

-        5. Keep in mind that there is one process per GPU, therefore:
+        4. Keep in mind that there is one process per GPU, therefore:

           + If your data processing is heavy, doing it in a separate dedicated process might be
             a better choice than doing them repeatedly in each process.

-           + Your need to set log directory carefully to avoid conflicts.
-             For example you can set it only for the chief process.
+           + You need to make sure log directories in each process won't conflict.
+             You can set it only for the chief process, or set a different one for each process.

-           + Callbacks have an option to be run only on the chief process, or on all processes.
+           + Callbacks have an option to be run only in the chief process, or in all processes.
             See :meth:`callback.set_chief_only()`. Most callbacks have a reasonable
             default already, but certain callbacks may not behave properly by default. Report an issue if you find any.

           + You can use Horovod API such as `hvd.rank()` to know which process you are.
             Chief process has rank 0.

-        6. Due to these caveats, see
+        5. Due to these caveats, see
           `ResNet-Horovod <https://github.com/tensorpack/benchmarks/tree/master/ResNet-Horovod>`_
           for a full example which has handled these common issues.
-           The example can train ImageNet in roughly an hour following the paper's setup.
+           This example can train ImageNet in roughly an hour following the paper's setup.
    """
    def __init__(self, average=True):
        """