Commit 3f91978c authored by Yuxin Wu

doc and config.optimizer change

parent 3b5ee108
@@ -9,7 +9,7 @@ Here are a list of things that were changed, starting from an early version.
 TensorFlow itself also changes API and those are not listed here.
 * 2017/02/11. `_get_input_vars()` in `ModelDesc` was renamed to `_get_inputs`. `InputVar` was
-  renamed to `InputDesc`.
+  renamed to `InputDesc`. See [commit](https://github.com/ppwwyyxx/tensorpack/commit/5b29bda9f17d7b587259e13963c4c8093e8387f8).
 * 2017/01/27. `TrainConfig(step_per_epoch)` was renamed to `steps_per_epoch`. See [commit](https://github.com/ppwwyyxx/tensorpack/commit/a9dd0b8ec34209ab86a92875589dbbc4716e73ef).
 * 2017/01/25. Argument order of `models.ConcatWith` is changed to follow the API change in
   TensorFlow upstream. See [commit](https://github.com/ppwwyyxx/tensorpack/commit/2df3dcf401a99fe61c699ad719e95528872d3abe).
...
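For reference, a minimal sketch of the renamed input API from the 2017/02/11 entry above; the exact signatures are assumed from this era of tensorpack and are not part of this commit:

```python
import tensorflow as tf
from tensorpack import ModelDesc, InputDesc   # InputDesc was formerly InputVar

class MyModel(ModelDesc):
    def _get_inputs(self):    # formerly _get_input_vars()
        return [InputDesc(tf.float32, [None, 28, 28], 'input'),
                InputDesc(tf.int32, [None], 'label')]

# Likewise, TrainConfig(..., steps_per_epoch=...) replaces the old step_per_epoch argument.
```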
@@ -10,17 +10,19 @@ Most of them are the best reproducible results on gym.
 `CUDA_VISIBLE_DEVICES=0 ./train-atari.py --env Breakout-v0`
-It should run at a speed of 6~10 iteration/s on 1 GPU plus 12+ CPU cores.
-Training with a significant slower speed (e.g. on CPU) will result in very bad score,
-probably because of async issues.
+It should run at a speed of 6~10 iterations/s on 1 GPU plus 12+ CPU cores.
+In each iteration it trains on a batch of 128 new states.
 The pre-trained models are all trained with 4 GPUs for about 2 days.
-But note that multi-GPU doesn't give you obvious speedup here,
-because the bottleneck in this implementation is not computation but data.
-On machines without huge memory, you may also need to
-enable tcmalloc to keep training throughput more stable.
-Occasionally, processes may not get terminated completely, therefore it is suggested to use `systemd-run` to run any
+But on simple games like Breakout, you can get good performance within several hours.
+Also note that multi-GPU doesn't give you obvious speedup here,
+because the bottleneck in this implementation is not computation but data.
+
+Some practical notes:
+1. On machines without huge memory, enabling tcmalloc may keep training throughput more stable.
+2. Occasionally, processes may not get terminated completely. It is suggested to use `systemd-run` to run any
 multiprocess Python program to get a cgroup dedicated for the task.
+3. Training at a significantly slower speed (e.g. on CPU) will result in a very bad score, probably because of async issues.

 ### To run a pretrained Atari model for 100 episodes:
...
@@ -109,7 +109,7 @@ class ObjAttrParam(HyperParam):
 class HyperParamSetter(Triggerable):
     """
-    An abstract base callback to set hyperparameters in every epoch.
+    An abstract base callback to set hyperparameters.
     """
     def __init__(self, param):
@@ -218,7 +218,7 @@ class ScheduledHyperParamSetter(HyperParamSetter):
             param: same as in :class:`HyperParamSetter`.
             schedule (list): with the format ``[(epoch1, val1), (epoch2, val2), (epoch3, val3)]``.
                 Each ``(ep, val)`` pair means to set the param
-                to "val" after the `ep` th epoch.
+                to "val" after the completion of the `ep`-th epoch.
                 If ep == 0, the value will be set before training.
             interp: None: no interpolation. 'linear': linear interpolation
...
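To make the schedule format above concrete, here is a hedged usage sketch; the hyperparameter name `learning_rate` and the values are illustrative assumptions, not part of this commit:

```python
from tensorpack.callbacks import ScheduledHyperParamSetter

# 1e-2 before training starts (ep == 0), 1e-3 after epoch 30 completes,
# 1e-4 after epoch 60 completes; no interpolation between the points.
lr_setter = ScheduledHyperParamSetter(
    'learning_rate', [(0, 1e-2), (30, 1e-3), (60, 1e-4)])
```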
@@ -180,6 +180,8 @@ def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
     # TODO for other towers, maybe can make it depend some op later
     # TODO update it later (similar to slim) might be faster?
+    # TODO the main tower already has too much work; would it be faster to update
+    # it only on the last tower?
     if ctx.is_main_training_tower:
         with tf.control_dependencies([update_op1, update_op2]):
             return tf.identity(xn, name='output')
...
@@ -105,7 +105,8 @@ class ModelDesc(object):
     def _get_input_vars(self):  # keep backward compatibility
         raise NotImplementedError()

-    # graph, cost, optimizer:
+    # build graph:
     def build_graph(self, model_inputs):
         """
         Build the whole symbolic graph.
@@ -120,6 +121,7 @@ class ModelDesc(object):
     def _build_graph(self, inputs):
         pass

+    # set cost. Only for single-cost model.
     def get_cost(self):
         """
         Return the cost tensor in the graph. Called by some of the :class:`tensorpack.train.Trainer` which
@@ -155,6 +157,8 @@ class ModelDesc(object):
     def _get_cost(self, *args):
         return self.cost

+    # set optimizer. only for single-optimizer model.
     def get_optimizer(self):
         """
         Returns:
@@ -180,7 +184,7 @@ class ModelFromMetaGraph(ModelDesc):
     Only useful for inference.
     """
-    # TODO can this be really used for inference?
+    # TODO this class may not be functional anymore.
     def __init__(self, filename):
         """
...
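The comments above describe the single-cost / single-optimizer contract. Below is a hedged sketch of a model that satisfies it; the underscore override `_get_optimizer` is assumed by analogy with `_build_graph` and `_get_cost`, and the sizes are illustrative:

```python
import tensorflow as tf
from tensorpack import ModelDesc, InputDesc

class LinearRegression(ModelDesc):
    def _get_inputs(self):
        return [InputDesc(tf.float32, [None, 10], 'x'),
                InputDesc(tf.float32, [None], 'y')]

    def _build_graph(self, inputs):
        x, y = inputs
        w = tf.get_variable('w', shape=[10, 1])
        b = tf.get_variable('b', shape=[1], initializer=tf.constant_initializer())
        pred = tf.squeeze(tf.matmul(x, w) + b, axis=1)
        # single-cost trainers read this tensor back through get_cost()
        self.cost = tf.reduce_mean(tf.square(pred - y), name='cost')

    def _get_optimizer(self):
        # read through get_optimizer(), and (after this commit) through config.optimizer
        return tf.train.AdamOptimizer(1e-3)
```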
@@ -130,10 +130,10 @@ class TrainConfig(object):
             self.predict_tower = [self.predict_tower]

         if 'optimizer' in kwargs:
-            self.optimizer = kwargs.pop('optimizer')
-            assert_type(self.optimizer, tf.train.Optimizer)
+            self._optimizer = kwargs.pop('optimizer')
+            assert_type(self._optimizer, tf.train.Optimizer)
         else:
-            self.optimizer = None
+            self._optimizer = None
         assert len(kwargs) == 0, 'Unknown arguments: {}'.format(str(kwargs.keys()))
@@ -157,3 +157,12 @@ class TrainConfig(object):
     @nr_tower.setter
     def nr_tower(self, value):
         self.tower = list(range(value))
+
+    @property
+    def optimizer(self):
+        """ For backward compatibility only; will be removed in the future. """
+        if self._optimizer:
+            return self._optimizer
+        opt = self.model.get_optimizer()
+        self._optimizer = opt
+        return opt
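What the new property enables: trainers keep reading `config.optimizer`, but the optimizer can now be defined on the model instead of being passed to `TrainConfig`. A hedged usage sketch follows, reusing the `LinearRegression` sketch above; the `FakeData` shapes and the omission of other `TrainConfig` arguments are simplifying assumptions, not part of this commit:

```python
from tensorpack import TrainConfig
from tensorpack.dataflow import FakeData

df = FakeData([[8, 10], [8]], size=100)    # dummy data matching the model's two inputs
config = TrainConfig(dataflow=df, model=LinearRegression(),
                     steps_per_epoch=100)  # note: no optimizer= argument
opt = config.optimizer                     # lazily resolved from model.get_optimizer(), then cached
```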
@@ -39,15 +39,12 @@ class FeedfreeTrainerBase(Trainer):
 class SingleCostFeedfreeTrainer(FeedfreeTrainerBase):
     """ A feedfree Trainer which assumes a single cost. """
     def _get_cost_and_grad(self):
-        """ get the cost and gradient on a new tower"""
+        """ get the cost and gradient"""
         actual_inputs = self._get_input_tensors()
         self.model.build_graph(actual_inputs)
         cost_var = self.model.get_cost()
+        # GATE_NONE faster?
         opt = self.config.optimizer
-        if opt is None:  # GATE_NONE faster?
-            opt = self.model.get_optimizer()  # XXX TODO not gonna work if optimizer modifies grad
-            self.config.optimizer = opt
         grads = opt.compute_gradients(
             cost_var,
             gate_gradients=tf.train.Optimizer.GATE_NONE,
...
@@ -152,11 +152,6 @@ class QueueInput(FeedfreeInput):
         assert len(ret) == len(self.input_placehdrs)
         for qv, v in zip(ret, self.input_placehdrs):
             qv.set_shape(v.get_shape())
-
-        # test the overhead of queue
-        # ret = [tf.Variable(tf.random_normal([64,224,224,3],
-        #     dtype=tf.float32), trainable=False),
-        #     tf.Variable(tf.ones([64], dtype=tf.int32), trainable=False)]
         return ret
@@ -225,9 +220,14 @@ class BatchQueueInput(FeedfreeInput):
 class DummyConstantInput(FeedfreeInput):
-    """ Input some constant tensor. Only for debugging performance issues """
+    """ Input with some random tensor placed on GPU.
+        Useful for debugging performance issues """
     def __init__(self, shapes):
+        """
+        Args:
+            shapes (list[list]): a list of fully-specified shapes.
+        """
         self.shapes = shapes
         logger.warn("Using dummy input for debug!")
@@ -236,11 +236,9 @@ class DummyConstantInput(FeedfreeInput):
         assert len(self.shapes) == len(placehdrs)
         ret = []
         for idx, p in enumerate(placehdrs):
-            with tf.device('/gpu:0'):
-                ret.append(tf.get_variable(
-                    'dummy-' + p.op.name, shape=self.shapes[idx],
-                    dtype=p.dtype, trainable=False,
-                    initializer=tf.constant_initializer()))
+            ret.append(tf.get_variable(
+                'dummy-' + p.op.name, shape=self.shapes[idx],
+                dtype=p.dtype, trainable=False))
         return ret
...
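A hedged usage sketch of the documented requirement that every shape be fully specified, including the batch dimension; the shapes and the import path are illustrative assumptions:

```python
from tensorpack.train.input_data import DummyConstantInput  # module path as in this commit's diff

# one fully-specified shape per model input, e.g. an image batch plus a label batch
dummy_input = DummyConstantInput([[64, 224, 224, 3], [64]])
```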
@@ -151,9 +151,6 @@ class SyncMultiGPUTrainer(MultiGPUTrainer,
                 name='averaged_cost')

             opt = self.config.optimizer
-            if opt is None:
-                opt = self.model.get_optimizer()
-                self.config.optimizer = opt
             grads = opt.compute_gradients(
                 cost,
                 gate_gradients=tf.train.Optimizer.GATE_NONE,
...
@@ -85,8 +85,6 @@ class SimpleTrainer(Trainer):
         cost_var = model.get_cost()
         opt = self.config.optimizer
-        if not opt:
-            opt = model.get_optimizer()
         grads = opt.compute_gradients(cost_var)
         grads = apply_grad_processors(grads,
                                       self.model.get_gradient_processor())
...