Commit 456f5675 authored by Yuxin Wu

misc updates

parent 2238ca93
......@@ -20,7 +20,7 @@ Claimed performance in the paper can be reproduced, on several games I've tested
![DQN](curve-breakout.png)
On one TitanX, Double-DQN took 1 day of training to reach a score of 400 on breakout game.
On one TitanX, Double-DQN took 1 day of training to reach a score of 400 on breakout.
Batch-A3C implementation only took <2 hours.
Double-DQN with nature paper setting runs at 60 batches (3840 trained frames, 240 seen frames, 960 game frames) per second on (Maxwell) TitanX.
......@@ -29,11 +29,9 @@ Double-DQN with nature paper setting runs at 60 batches (3840 trained frames, 24
Install [ALE](https://github.com/mgbellemare/Arcade-Learning-Environment) and gym.
Download an [atari rom](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms) to
`$TENSORPACK_DATASET/atari_rom/` (defaults to ~/tensorpack_data/atari_rom/), e.g.:
Download an [atari rom](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms), e.g.:
```
mkdir -p ~/tensorpack_data/atari_rom
wget https://github.com/openai/atari-py/raw/master/atari_py/atari_roms/breakout.bin -O ~/tensorpack_data/atari_rom/breakout.bin
wget https://github.com/openai/atari-py/raw/master/atari_py/atari_roms/breakout.bin
```
Start Training:
......@@ -44,8 +42,9 @@ Start Training:
Watch the agent play:
```
./DQN.py --rom breakout.bin --task play --load path/to/model
# Download pretrained models or use one you trained:
wget http://models.tensorpack.com/DeepQNetwork/DoubleDQN-Breakout.npz
./DQN.py --rom breakout.bin --task play --load DoubleDQN-Breakout.npz
```
A pretrained model on breakout can be downloaded [here](http://models.tensorpack.com/DeepQNetwork/).
A3C code and models for Atari games in OpenAI Gym are released in [examples/A3C-Gym](../A3C-Gym)
......@@ -83,7 +83,6 @@ class AtariPlayer(gym.Env):
self.viz = viz
if self.viz and isinstance(self.viz, float):
self.windowname = os.path.basename(rom_file)
cv2.startWindowThread()
cv2.namedWindow(self.windowname)
self.ale.loadROM(rom_file.encode('utf-8'))
......
......@@ -250,6 +250,7 @@ class ProxyCallback(Callback):
cb(Callback): the underlying callback
"""
assert isinstance(cb, Callback), type(cb)
self.chief_only = cb.chief_only
self.cb = cb
def _before_train(self):
......
......@@ -25,6 +25,8 @@ class GPUUtilizationTracker(Callback):
It will start a process to run `nvidia-smi` every second
within the epoch (the trigger_epoch time is not included),
and write average utilization to monitors.
This callback creates a process, therefore it cannot be used with MPI.
"""
_chief_only = False
......@@ -51,6 +53,7 @@ class GPUUtilizationTracker(Callback):
assert len(self._devices), "[GPUUtilizationTracker] No GPU device given!"
def _before_train(self):
assert 'OMPI_COMM_WORLD_SIZE' not in os.environ, "GPUUtilizationTracker cannot be used under MPI!"
self._evt = mp.Event()
self._stop_evt = mp.Event()
self._queue = mp.Queue()
......
......@@ -71,6 +71,7 @@ class ImageFromFile(RNGDataFlow):
self.rng.shuffle(self.files)
for f in self.files:
im = cv2.imread(f, self.imread_mode)
assert im is not None, f
if self.channel == 3:
im = im[:, :, ::-1]
if self.resize is not None:
......
......@@ -6,8 +6,9 @@ from collections import defaultdict
import copy
from functools import wraps
from inspect import isfunction, getmembers
from .tower import get_current_tower_context
from ..utils import logger
import tensorflow as tf
__all__ = ['argscope', 'get_arg_scope', 'enable_argscope_for_module']
......@@ -74,18 +75,14 @@ def argscope_mapper(func, log_shape=True):
actual_args = copy.copy(get_arg_scope()[func.__name__])
actual_args.update(kwargs)
out_tensor = func(*args, **actual_args)
scope_name = tf.get_variable_scope().name
is_tower_scope = 'tower' in scope_name
in_tensor = args[0]
ctx = get_current_tower_context()
name = '<unkown>' if 'name' not in kwargs else kwargs['name']
if log_shape:
if is_tower_scope:
if 'tower0' in scope_name:
logger.info('%20s: %20s -> %20s' % (name, in_tensor.shape.as_list(), out_tensor.shape.as_list()))
else:
logger.info('%20s: %20s -> %20s' % (name, in_tensor.shape.as_list(), out_tensor.shape.as_list()))
if ('tower' not in ctx.ns_name.lower()) or ctx.is_main_training_tower:
logger.info('%20s: %20s -> %20s' %
(name, in_tensor.shape.as_list(), out_tensor.shape.as_list()))
return out_tensor
# argscope requires this property
......@@ -98,6 +95,9 @@ def enable_argscope_for_module(module, log_shape=True):
Overwrite all functions of a given module to support argscope.
Note that this function monkey-patches the module and therefore could have unexpected consequences.
It has only been tested to work well with the `tf.layers` module.
Args:
log_shape (bool): print input/output shapes of each function when called.
"""
for name, obj in getmembers(module):
if isfunction(obj):
......
......@@ -20,7 +20,7 @@ _CurrentTowerContext = None
class TowerContext(object):
""" A context where the current model is built in.
Since TF1.8, TensorFlow starts to introduce the same concept.
Since TF 1.8, TensorFlow has started to introduce the same concept.
"""
def __init__(self, tower_name, is_training, index=0, vs_name=''):
......
......@@ -297,42 +297,40 @@ class HorovodTrainer(SingleCostTrainer):
# change trainer to HorovodTrainer(), then
/path/to/mpirun -np 8 -H server1:4,server2:4 \\
-bind-to none -map-by slot \\
--output-filename mylog -x LD_LIBRARY_PATH -x CUDA_VISIBLE_DEVICES=0,1,2,3 \\
--output-filename mylog -x LD_LIBRARY_PATH \\
python train.py
# (Add other environment variables you need by -x, e.g. PYTHONPATH, PATH)
# Add other environment variables you need by -x, e.g. PYTHONPATH, PATH.
# If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
# There are other MPI options that can potentially improve performance, especially on special hardware.
Note:
1. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
2. Due to the use of MPI, training is less informative (no progress bar).
3. Due to a TF bug, you must not initialize CUDA context before training.
1. Due to a TF bug, you must not initialize CUDA context before training.
Therefore TF functions like `is_gpu_available()` or `list_local_devices()`
must be avoided.
4. MPI does not like fork(). If your dataflow contains multiprocessing, it may cause problems.
2. MPI does not like fork(). If your dataflow contains multiprocessing, it may cause problems.
3. MPI sometimes fails to kill all processes. Be sure to check it.
3. MPI sometimes fails to kill all processes. Be sure to check it afterwards.
5. Keep in mind that there is one process per GPU, therefore:
4. Keep in mind that there is one process per GPU, therefore:
+ If your data processing is heavy, doing it in a separate dedicated process might be
a better choice than doing it repeatedly in each process.
+ Your need to set log directory carefully to avoid conflicts.
For example you can set it only for the chief process.
+ You need to make sure log directories in each process won't conflict.
You can set it only for the chief process, or set a different one for each process.
+ Callbacks have an option to be run only on the chief process, or on all processes.
+ Callbacks have an option to be run only in the chief process, or in all processes.
See :meth:`callback.set_chief_only()`. Most callbacks have a reasonable
default already, but certain callbacks may not behave properly by default. Report an issue if you find any.
+ You can use Horovod APIs such as `hvd.rank()` to know which process you are in.
Chief process has rank 0.
6. Due to these caveats, see
5. Due to these caveats, see
`ResNet-Horovod <https://github.com/tensorpack/benchmarks/tree/master/ResNet-Horovod>`_
for a full example which has handled these common issues.
The example can train ImageNet in roughly an hour following the paper's setup.
This example can train ImageNet in roughly an hour following the paper's setup.
"""
def __init__(self, average=True):
"""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment