Commit 3f91978c authored by Yuxin Wu

doc and config.optimizer change

parent 3b5ee108
......@@ -9,7 +9,7 @@ Here are a list of things that were changed, starting from an early version.
TensorFlow itself also changes API and those are not listed here.
* 2017/02/11. `_get_input_vars()` in `ModelDesc` was renamed to `_get_inputs`. `InputVar` was
renamed to `InputDesc`.
renamed to `InputDesc`. See [commit](https://github.com/ppwwyyxx/tensorpack/commit/5b29bda9f17d7b587259e13963c4c8093e8387f8).
* 2017/01/27. `TrainConfig(step_per_epoch)` was renamed to `steps_per_epoch`. See [commit](https://github.com/ppwwyyxx/tensorpack/commit/a9dd0b8ec34209ab86a92875589dbbc4716e73ef).
* 2017/01/25. Argument order of `models.ConcatWith` is changed to follow the API change in
TensorFlow upstream. See [commit](https://github.com/ppwwyyxx/tensorpack/commit/2df3dcf401a99fe61c699ad719e95528872d3abe).
......
......@@ -10,17 +10,19 @@ Most of them are the best reproducible results on gym.
`CUDA_VISIBLE_DEVICES=0 ./train-atari.py --env Breakout-v0`
It should run at a speed of 6~10 iteration/s on 1 GPU plus 12+ CPU cores.
Training at a significantly slower speed (e.g. on CPU) will result in a very bad score,
probably because of async issues.
It should run at a speed of 6~10 iterations/s on 1 GPU plus 12+ CPU cores.
In each iteration it trains on a batch of 128 new states.
The pre-trained models are all trained with 4 GPUs for about 2 days.
But note that multi-GPU doesn't give you obvious speedup here,
because the bottleneck in this implementation is not computation but data. On machines without huge memory, you may also need to
enable tcmalloc to keep training throughput more stable.
But on simple games like Breakout, you can get good performance within several hours.
Also note that multi-GPU doesn't give you obvious speedup here,
because the bottleneck in this implementation is not computation but data.
Occasionally, processes may not get terminated completely, therefore it is suggested to use `systemd-run` to run any
Some practical notes:
1. On machines without huge memory, enabling tcmalloc may keep training throughput more stable.
2. Occasionally, processes may not get terminated completely. It is suggested to use `systemd-run` to run any
multiprocess Python program to get a cgroup dedicated for the task.
3. Training at a significantly slower speed (e.g. on CPU) will result in a very bad score, probably because of async issues.
### To run a pretrained Atari model for 100 episodes:
......
......@@ -109,7 +109,7 @@ class ObjAttrParam(HyperParam):
class HyperParamSetter(Triggerable):
"""
An abstract base callback to set hyperparameters in every epoch.
An abstract base callback to set hyperparameters.
"""
def __init__(self, param):
......@@ -218,7 +218,7 @@ class ScheduledHyperParamSetter(HyperParamSetter):
param: same as in :class:`HyperParamSetter`.
schedule (list): with the format ``[(epoch1, val1), (epoch2, val2), (epoch3, val3)]``.
Each ``(ep, val)`` pair means to set the param
to "val" after the `ep` th epoch.
to "val" after the completion of `ep` th epoch.
If ep == 0, the value will be set before training.
interp: None: no interpolation. 'linear': linear interpolation
......
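For reference, a minimal usage sketch of such a schedule; the parameter name `learning_rate` and the epoch/value pairs below are illustrative, not taken from this commit (the import path may also vary slightly by version):

```python
from tensorpack.callbacks import ScheduledHyperParamSetter

# Set 'learning_rate' to 1e-2 before training starts (ep == 0),
# then to 1e-3 after epoch 30 completes and to 1e-4 after epoch 60.
lr_setter = ScheduledHyperParamSetter(
    'learning_rate',
    [(0, 1e-2), (30, 1e-3), (60, 1e-4)])
```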
......@@ -180,6 +180,8 @@ def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
# TODO for other towers, maybe can make it depend some op later
# TODO update it later (similar to slim) might be faster?
# TODO main tower already has too much work; would it be faster to update
# it only on the last tower?
if ctx.is_main_training_tower:
with tf.control_dependencies([update_op1, update_op2]):
return tf.identity(xn, name='output')
......
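As an aside, the main-tower pattern above reads as: make the moving-average update ops a control dependency of the layer output, so they run whenever the output is fetched on the main training tower. A standalone sketch; the helper name and the `update_ops` argument are hypothetical, standing in for `update_op1`/`update_op2`:

```python
import tensorflow as tf

def output_with_ema_updates(xn, update_ops, is_main_training_tower):
    """Return the normalized tensor; on the main tower, tie the EMA
    update ops to the output so they run as a side effect."""
    if is_main_training_tower:
        with tf.control_dependencies(update_ops):
            # tf.identity is created inside the dependency scope, so
            # fetching it also triggers the moving-average updates.
            return tf.identity(xn, name='output')
    return tf.identity(xn, name='output')
```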
......@@ -105,7 +105,8 @@ class ModelDesc(object):
def _get_input_vars(self): # keep backward compatibility
raise NotImplementedError()
# graph, cost, optimizer:
# build graph:
def build_graph(self, model_inputs):
"""
Build the whole symbolic graph.
......@@ -120,6 +121,7 @@ class ModelDesc(object):
def _build_graph(self, inputs):
pass
# set cost. Only for single-cost model.
def get_cost(self):
"""
Return the cost tensor in the graph. Called by some of the :class:`tensorpack.train.Trainer` which
......@@ -155,6 +157,8 @@ class ModelDesc(object):
def _get_cost(self, *args):
return self.cost
# set optimizer. Only for single-optimizer model.
def get_optimizer(self):
"""
Returns:
......@@ -180,7 +184,7 @@ class ModelFromMetaGraph(ModelDesc):
Only useful for inference.
"""
# TODO can this be really used for inference?
# TODO this class may not be functional anymore.
def __init__(self, filename):
"""
......
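To illustrate how the pieces above fit together, here is a hedged sketch of a single-cost, single-optimizer model. The `_get_inputs` / `_build_graph` / `_get_optimizer` override names are assumptions about the version at this commit and may differ slightly:

```python
import tensorflow as tf
from tensorpack import ModelDesc, InputDesc

class LinearModel(ModelDesc):
    def _get_inputs(self):
        return [InputDesc(tf.float32, (None, 28, 28, 1), 'image'),
                InputDesc(tf.int32, (None,), 'label')]

    def _build_graph(self, inputs):
        image, label = inputs
        flat = tf.reshape(image, [-1, 28 * 28])
        W = tf.get_variable('W', [28 * 28, 10])
        b = tf.get_variable('b', [10], initializer=tf.zeros_initializer())
        logits = tf.matmul(flat, W) + b
        # single cost; the default _get_cost() returns self.cost
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=label), name='cost')

    def _get_optimizer(self):
        # single optimizer; returned by get_optimizer()
        return tf.train.AdamOptimizer(1e-3)
```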
......@@ -130,10 +130,10 @@ class TrainConfig(object):
self.predict_tower = [self.predict_tower]
if 'optimizer' in kwargs:
self.optimizer = kwargs.pop('optimizer')
assert_type(self.optimizer, tf.train.Optimizer)
self._optimizer = kwargs.pop('optimizer')
assert_type(self._optimizer, tf.train.Optimizer)
else:
self.optimizer = None
self._optimizer = None
assert len(kwargs) == 0, 'Unknown arguments: {}'.format(str(kwargs.keys()))
......@@ -157,3 +157,12 @@ class TrainConfig(object):
@nr_tower.setter
def nr_tower(self, value):
self.tower = list(range(value))
@property
def optimizer(self):
""" for back-compatibilty only. will remove in the future"""
if self._optimizer:
return self._optimizer
opt = self.model.get_optimizer()
self._optimizer = opt
return opt
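In other words, trainer code can now read `config.optimizer` uniformly: if an optimizer was passed to `TrainConfig` (old style) it is used, otherwise it is lazily obtained from `model.get_optimizer()` and cached. A minimal standalone sketch of this fallback pattern; the class name is hypothetical, not the real `TrainConfig`:

```python
class ConfigSketch(object):
    """Sketch of the back-compat fallback above; not the real TrainConfig."""
    def __init__(self, model, optimizer=None):
        self.model = model
        self._optimizer = optimizer      # old style: passed explicitly

    @property
    def optimizer(self):
        # Prefer the explicitly-passed optimizer; otherwise ask the model
        # once and cache the result so every caller sees the same instance.
        if self._optimizer is None:
            self._optimizer = self.model.get_optimizer()
        return self._optimizer
```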
......@@ -39,15 +39,12 @@ class FeedfreeTrainerBase(Trainer):
class SingleCostFeedfreeTrainer(FeedfreeTrainerBase):
""" A feedfree Trainer which assumes a single cost. """
def _get_cost_and_grad(self):
""" get the cost and gradient on a new tower"""
""" get the cost and gradient"""
actual_inputs = self._get_input_tensors()
self.model.build_graph(actual_inputs)
cost_var = self.model.get_cost()
# GATE_NONE faster?
opt = self.config.optimizer
if opt is None:
opt = self.model.get_optimizer() # XXX TODO not gonna work if optimizer modifies grad
self.config.optimizer = opt
# GATE_NONE faster?
grads = opt.compute_gradients(
cost_var,
gate_gradients=tf.train.Optimizer.GATE_NONE,
......
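For context, the `gate_gradients` argument controls how much synchronization TensorFlow adds between gradient computations; `GATE_NONE` lets each gradient run as soon as its inputs are ready, which is typically fastest (hence the "GATE_NONE faster?" comment). A minimal sketch:

```python
import tensorflow as tf

# Gating levels accepted by Optimizer.compute_gradients():
#   GATE_NONE  - no extra synchronization; gradients run as soon as possible
#   GATE_OP    - gradients of each op are gated together
#   GATE_GRAPH - all gradients are computed before any of them is used
def compute_grads(opt, cost_var):
    return opt.compute_gradients(
        cost_var, gate_gradients=tf.train.Optimizer.GATE_NONE)
```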
......@@ -152,11 +152,6 @@ class QueueInput(FeedfreeInput):
assert len(ret) == len(self.input_placehdrs)
for qv, v in zip(ret, self.input_placehdrs):
qv.set_shape(v.get_shape())
# test the overhead of queue
# ret = [tf.Variable(tf.random_normal([64,224,224,3],
# dtype=tf.float32), trainable=False),
# tf.Variable(tf.ones([64], dtype=tf.int32), trainable=False)]
return ret
......@@ -225,9 +220,14 @@ class BatchQueueInput(FeedfreeInput):
class DummyConstantInput(FeedfreeInput):
""" Input some constant tensor. Only for debugging performance issues """
""" Input with some random tensor placed on GPU.
Useful for debugging performance issues """
def __init__(self, shapes):
"""
Args:
shapes (list[list]): a list of fully-specified shapes.
"""
self.shapes = shapes
logger.warn("Using dummy input for debug!")
......@@ -236,11 +236,9 @@ class DummyConstantInput(FeedfreeInput):
assert len(self.shapes) == len(placehdrs)
ret = []
for idx, p in enumerate(placehdrs):
with tf.device('/gpu:0'):
ret.append(tf.get_variable(
'dummy-' + p.op.name, shape=self.shapes[idx],
dtype=p.dtype, trainable=False,
initializer=tf.constant_initializer()))
dtype=p.dtype, trainable=False))
return ret
......
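A hedged usage sketch of the class above; the shapes are illustrative, and the exact import path (top-level vs. a submodule) is an assumption that may vary by version:

```python
from tensorpack import DummyConstantInput  # import path is an assumption

# One fully-specified shape per model input, e.g. 64 images and 64 labels.
# Useful to check whether the input pipeline is the training bottleneck.
dummy_input = DummyConstantInput([[64, 224, 224, 3], [64]])
```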
......@@ -151,9 +151,6 @@ class SyncMultiGPUTrainer(MultiGPUTrainer,
name='averaged_cost')
opt = self.config.optimizer
if opt is None:
opt = self.model.get_optimizer()
self.config.optimizer = opt
grads = opt.compute_gradients(
cost,
gate_gradients=tf.train.Optimizer.GATE_NONE,
......
......@@ -85,8 +85,6 @@ class SimpleTrainer(Trainer):
cost_var = model.get_cost()
opt = self.config.optimizer
if not opt:
opt = model.get_optimizer()
grads = opt.compute_gradients(cost_var)
grads = apply_grad_processors(grads,
self.model.get_gradient_processor())
......