Commit 4bc0c748 authored by Yuxin Wu

update docs

parent 60f4c6df
......@@ -10,7 +10,7 @@ def train(self):
# start training:
with sess.as_default():
callbacks.before_train()
for epoch in range(epoch_start, epoch_end):
for epoch in range(starting_epoch, max_epoch + 1):
callbacks.before_epoch()
for step in range(steps_per_epoch):
self.run_step() # callbacks.{before,after}_run are hooked with session
......
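For readers skimming the diff: the loop above now runs epochs inclusively from `starting_epoch` to `max_epoch`. A minimal, self-contained sketch of the same control flow, with stub callbacks that are not part of tensorpack, makes the bounds explicit:

# Stub callbacks for illustration only; the real Trainer hooks these into a
# tf.Session, and before_run/after_run fire around each session call.
class _StubCallbacks:
    def before_train(self): print("before_train")
    def before_epoch(self): print("before_epoch")
    def after_epoch(self): print("after_epoch")
    def after_train(self): print("after_train")

def run_step():
    pass  # one optimizer step would run here

def train(starting_epoch=1, max_epoch=3, steps_per_epoch=5):
    callbacks = _StubCallbacks()
    callbacks.before_train()
    # inclusive upper bound: epochs are 1-based and max_epoch itself is run
    for epoch in range(starting_epoch, max_epoch + 1):
        callbacks.before_epoch()
        for step in range(steps_per_epoch):
            run_step()
        callbacks.after_epoch()
    callbacks.after_train()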
......@@ -59,7 +59,8 @@ class COCODetection(object):
def __init__(self, basedir, name):
assert name in COCOMeta.INSTANCE_TO_BASEDIR.keys(), name
self.name = name
self._imgdir = os.path.join(basedir, COCOMeta.INSTANCE_TO_BASEDIR[name])
self._imgdir = os.path.realpath(os.path.join(
basedir, COCOMeta.INSTANCE_TO_BASEDIR[name]))
assert os.path.isdir(self._imgdir), self._imgdir
annotation_file = os.path.join(
basedir, 'annotations/instances_{}.json'.format(name))
......
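A hypothetical call to the class in this hunk, just to show what the new `os.path.realpath` affects; the directory layout and split name are assumptions, not part of the diff:

# 'train2014' is one of the splits listed in COCOMeta.INSTANCE_TO_BASEDIR;
# the basedir below is a placeholder.
coco = COCODetection('/path/to/COCO/DIR', 'train2014')
# _imgdir is now resolved with os.path.realpath, so if basedir is a symlink
# the image paths built from it still point at the real location.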
......@@ -381,7 +381,9 @@ if __name__ == '__main__':
model=Model(),
data=QueueInput(get_train_dataflow(add_mask=config.MODE_MASK)),
callbacks=[
PeriodicCallback(
ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
every_k_epochs=20),
# linear warmup
ScheduledHyperParamSetter(
'learning_rate', warmup_schedule, interp='linear', step_based=True),
......
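For context, `warmup_schedule` is defined elsewhere in the script and not shown in this hunk; a sketch of what a step-based linear warmup schedule for `ScheduledHyperParamSetter` could look like (values are illustrative):

# Illustrative values only: (global_step, value) pairs that the callback
# above interpolates linearly, because it is constructed with
# interp='linear' and step_based=True.
base_lr = 0.01
warmup_steps = 1000
warmup_schedule = [(0, base_lr / 3), (warmup_steps, base_lr)]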
......@@ -16,8 +16,8 @@ from tensorpack.utils.argtools import memoized
class GANModelDesc(ModelDescBase):
def collect_variables(self, g_scope='gen', d_scope='discrim'):
"""
Assign self.g_vars to the parameters under scope `g_scope`,
and same with self.d_vars.
Assign `self.g_vars` to the parameters under scope `g_scope`,
and same with `self.d_vars`.
"""
self.g_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, g_scope)
assert self.g_vars
......@@ -25,7 +25,10 @@ class GANModelDesc(ModelDescBase):
assert self.d_vars
def build_losses(self, logits_real, logits_fake):
"""D and G play two-player minimax game with value function V(G,D)
"""
Build standard GAN loss and set `self.g_loss` and `self.d_loss`.
D and G play two-player minimax game with value function V(G,D)
min_G max_D V(D, G) = IE_{x ~ p_data} [log D(x)] + IE_{z ~ p_fake} [log (1 - D(G(z)))]
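As a rough sketch of what `build_losses` computes from the two logits (not necessarily identical to tensorpack's implementation, which also records accuracy summaries), the value function above is commonly implemented with sigmoid cross-entropy:

import tensorflow as tf

def standard_gan_losses(logits_real, logits_fake):
    # D should score real samples as 1 and generated samples as 0.
    d_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits_real, labels=tf.ones_like(logits_real)))
    d_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits_fake, labels=tf.zeros_like(logits_fake)))
    d_loss = tf.add(d_loss_real, d_loss_fake, name='d_loss')
    # Non-saturating G loss: G tries to make D score fake samples as 1.
    g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits_fake, labels=tf.ones_like(logits_fake)), name='g_loss')
    return d_loss, g_loss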
......@@ -58,6 +61,13 @@ class GANModelDesc(ModelDescBase):
add_moving_summary(self.g_loss, self.d_loss, d_accuracy, g_accuracy)
def _build_graph(self, inputs):
"""
Have to build one tower and set the following attributes:
g_loss, d_loss, g_vars, d_vars.
"""
pass
@memoized
def get_optimizer(self):
return self._get_optimizer()
......@@ -65,6 +75,11 @@ class GANModelDesc(ModelDescBase):
class GANTrainer(TowerTrainer):
def __init__(self, input, model):
"""
Args:
input (InputSource):
model (GANModelDesc):
"""
super(GANTrainer, self).__init__()
assert isinstance(model, GANModelDesc), model
inputs_desc = model.get_inputs_desc()
......@@ -149,6 +164,7 @@ class MultiGPUGANTrainer(TowerTrainer):
def get_cost(*inputs):
model.build_graph(*inputs)
return [model.d_loss, model.g_loss]
self.tower_func = TowerFuncWrapper(get_cost, model.get_inputs_desc())
devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
cost_list = DataParallelBuilder.build_on_towers(
......
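A hypothetical end-to-end wiring of the `GANTrainer` documented above; `my_dataflow` and `Model` (a `GANModelDesc` subclass such as the one in the DCGAN example) are placeholders, not part of this commit:

from tensorpack import QueueInput, ModelSaver

input = QueueInput(my_dataflow)          # any InputSource works here
trainer = GANTrainer(input, Model())     # Model must subclass GANModelDesc
trainer.train_with_defaults(
    callbacks=[ModelSaver()],
    steps_per_epoch=300,
    max_epoch=100)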
......@@ -39,8 +39,8 @@ class Model(ModelDesc):
bottleneck = functools.partial(resnet_bottleneck, stride_first=True)
# tensorflow with padding=SAME will by default pad [2,3] here.
# but caffe conv with stride will pad [3,3]
image = tf.pad(image, [[0, 0], [3, 3], [3, 3], [0, 0]])
# but caffe conv with stride will pad [3,2]
image = tf.pad(image, [[0, 0], [3, 2], [3, 2], [0, 0]])
image = tf.transpose(image, [0, 3, 1, 2])
with argscope([Conv2D, MaxPooling, GlobalAvgPooling, BatchNorm],
data_format='channels_first'), \
......
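A quick sanity check on the padding change above, assuming the standard 224x224 input and ResNet's 7x7 stride-2 stem convolution (these numbers are not in the hunk itself):

def conv_out(size, kernel=7, stride=2, pad_lo=3, pad_hi=2):
    # output length of a 1-D VALID convolution after explicit padding
    return (pad_lo + size + pad_hi - kernel) // stride + 1

# caffe pads 3 on both sides and also produces 112, but with stride 2 its
# last padded column is never read, so explicit [3, 2] padding is equivalent;
# TF's padding=SAME would instead pad [2, 3], shifting every window by one.
assert conv_out(224, pad_lo=3, pad_hi=2) == 112
assert conv_out(224, pad_lo=3, pad_hi=3) == 112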
......@@ -31,7 +31,8 @@ class MovingAverageSummary(Callback):
def _setup_graph(self):
ops = tf.get_collection(self._collection)
logger.info("Maintain moving average summary of {} tensors.".format(len(ops)))
logger.info("Maintain moving average summary of {} tensors in collection {}.".format(
len(ops), self._collection))
self.ema_op = tf.group(*ops, name='maintain_moving_average_summary')
self._fetch = tf.train.SessionRunArgs(fetches=self.ema_op)
......@@ -47,7 +48,7 @@ class MergeAllSummaries_RunAlone(Callback):
def _setup_graph(self):
size = len(tf.get_collection(self._key))
logger.info("Summarizing collection '{}' of size {}".format(self._key, size))
logger.info("Summarizing collection '{}' of size {}.".format(self._key, size))
self.summary_op = tf.summary.merge_all(self._key)
def _trigger_step(self):
......@@ -68,7 +69,7 @@ class MergeAllSummaries_RunWithOp(Callback):
def _setup_graph(self):
size = len(tf.get_collection(self._key))
logger.info("Summarizing collection '{}' of size {}".format(self._key, size))
logger.info("Summarizing collection '{}' of size {}.".format(self._key, size))
self.summary_op = tf.summary.merge_all(self._key)
if self.summary_op is not None:
self._fetches = tf.train.SessionRunArgs(self.summary_op)
......
......@@ -162,8 +162,9 @@ class PeriodicCallback(EnableCallbackIf):
callback (Callback): a Callback instance.
every_k_steps (int): enable the callback when ``global_step % k == 0``. Set to
None to ignore.
every_k_epochs (int): enable the callback when ``epoch_num % k == 0``. Set to
None to ignore.
every_k_epochs (int): enable the callback when ``epoch_num % k == 0``.
Also enable when the last step finishes (``epoch_num == max_epoch``
and ``local_step == steps_per_epoch - 1``). Set to None to ignore.
every_k_steps and every_k_epochs can both be set, but cannot both be None.
"""
......@@ -179,6 +180,10 @@ class PeriodicCallback(EnableCallbackIf):
return True
if self._epoch_k is not None and self.epoch_num % self._epoch_k == 0:
return True
if self._epoch_k is not None:
if self.local_step == self.trainer.steps_per_epoch - 1 and \
self.epoch_num == self.trainer.max_epoch:
return True
return False
def __str__(self):
......
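A hypothetical use of the behavior documented above; the wrapped callback and the value of k are illustrative:

from tensorpack import PeriodicCallback, ModelSaver

cb = PeriodicCallback(ModelSaver(max_to_keep=10), every_k_epochs=20)
# ModelSaver now triggers whenever epoch_num % 20 == 0, and additionally at
# the last step of the last epoch, so the final model is saved even when
# max_epoch is not a multiple of 20.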
......@@ -90,7 +90,7 @@ class ModelDescBase(object):
"""
Build the whole symbolic graph.
This is supposed to be the "tower function" when used with :class:`TowerTrainer`.
By default it will call :meth:`_build_graph` with a list of input tensors, for backward-compatibility.
By default it will call :meth:`_build_graph` with a list of input tensors.
Args:
args ([tf.Tensor]): tensors that matches the list of
......
......@@ -72,7 +72,7 @@ class FilterNoneGrad(GradientProcessor):
g.append((grad, var))
if self._verbose and len(to_print):
message = ', '.join(to_print)
logger.warn("No gradient w.r.t these trainable variables: {}".format(message))
logger.warn("No gradient w.r.t {} trainable variables: {}".format(len(to_print), message))
return g
......
......@@ -98,7 +98,7 @@ class SaverRestore(SessionInit):
"""
Args:
model_path (str): a model name (model-xxxx) or a ``checkpoint`` file.
prefix (str): during restore, add a ``prefix/`` for every variable in this checkpoint
prefix (str): during restore, add a ``prefix/`` for every variable in this checkpoint.
ignore (list[str]): list of tensor names that should be ignored during loading, e.g. learning-rate
"""
if model_path.endswith('.npy') or model_path.endswith('.npz'):
......@@ -134,7 +134,7 @@ class SaverRestore(SessionInit):
for v in graph_vars:
name = get_savename_from_varname(v.name, varname_prefix=self.prefix)
if name in self.ignore and reader.has_tensor(name):
logger.info("Variable {} in the graph will be not loaded from the checkpoint!".format(name))
logger.info("Variable {} in the graph will not be loaded from the checkpoint!".format(name))
else:
if reader.has_tensor(name):
func(reader, name, v)
......
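An illustrative use of the two arguments documented above; the checkpoint path and variable names are placeholders:

from tensorpack.tfutils.sessinit import SaverRestore

session_init = SaverRestore(
    '/path/to/train_log/model-100000',
    prefix='tower0',           # checkpoint var 'X' is loaded into 'tower0/X'
    ignore=['learning_rate'])  # never restore this variable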
......@@ -13,9 +13,7 @@ from ..utils import logger
from .common import get_op_tensor_name
__all__ = ['SessionUpdate', 'dump_session_params', 'dump_chkpt_vars',
'load_chkpt_vars',
# 'get_savename_from_varname', 'is_training_name',
'get_checkpoint_path']
'load_chkpt_vars', 'get_checkpoint_path']
def get_savename_from_varname(
......
......@@ -48,28 +48,24 @@ def apply_default_prefetch(input_source_or_dataflow, trainer):
def launch_train_with_config(config, trainer):
"""
Train with a :class:`TrainConfig` and a :class:`Trainer`, to
mimic the old training interface. It basically does the following
3 things (and you can easily do them by yourself):
present a simple training interface. It basically does the following
3 things (and you can easily do them by yourself if you need more control):
1. Setup the :class:`InputSource` with automatic prefetching,
for `config.data` or `config.dataflow`.
2. Call `trainer.setup_graph` with the :class:`InputSource`,
as well as `config.model`.
1. Setup the input with automatic prefetching,
from `config.data` or `config.dataflow`.
2. Call `trainer.setup_graph` with the input as well as `config.model`.
3. Call `trainer.train` with rest of the attributes of config.
Args:
config (TrainConfig):
trainer (Trainer): an instance of a SingleCostTrainer
trainer (Trainer): an instance of :class:`SingleCostTrainer`.
Examples:
.. code-block:: python
# With the old trainer:
SyncMultiGPUTrainerParameterServer(config, ps_device='gpu').train()
# With the current version of trainer:
launch_train_with_config(
config, SyncMultiGPUTrainerParameterServer(towers, ps_device='gpu'))
config, SyncMultiGPUTrainerParameterServer(8, ps_device='gpu'))
"""
assert isinstance(trainer, SingleCostTrainer), trainer
assert isinstance(config, TrainConfig), config
......
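A slightly fuller sketch of the interface documented above; `Model`, `my_dataflow`, and the numbers are placeholders, not part of this commit:

from tensorpack import (TrainConfig, ModelSaver, launch_train_with_config,
                        SyncMultiGPUTrainerParameterServer)

config = TrainConfig(
    model=Model(),                # a ModelDesc subclass
    dataflow=my_dataflow,         # or data=SomeInputSource(...)
    callbacks=[ModelSaver()],
    steps_per_epoch=500,
    max_epoch=100)
launch_train_with_config(
    config, SyncMultiGPUTrainerParameterServer(8, ps_device='gpu'))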