Commit 801e2921 authored by Yuxin Wu

update docs

parent 07783edb
@@ -34,7 +34,7 @@ class RunOp(Callback):
 run_step (bool): run the Op every step (along with training)
 verbose (bool): print logs when the op is run.
-Examples:
+Example:
 The `DQN Example
 <https://github.com/tensorpack/tensorpack/blob/master/examples/DeepQNetwork/>`_
 uses this callback to update target network.
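
As a reading aid, a minimal sketch of how ``RunOp`` is typically wired up; the variables and op name below are placeholders, not taken from this diff:

.. code-block:: python

    import tensorflow as tf
    from tensorpack.callbacks import RunOp

    # Placeholder variables standing in for target/online network weights.
    target_w = tf.get_variable('target_w', shape=[10], trainable=False)
    online_w = tf.get_variable('online_w', shape=[10])
    sync_op = tf.group(tf.assign(target_w, online_w), name='sync_target')

    # By default the op runs before training and on every trigger (epoch);
    # run_step=True would also run it along with every training step.
    callback = RunOp(sync_op, verbose=True)
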
@@ -105,7 +105,7 @@ class ProcessTensors(Callback):
 to the session.
 You can use it to print tensors, save tensors to file, etc.
-Examples:
+Example:
 .. code-block:: python
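
A hedged sketch of the kind of usage the (collapsed) example refers to; the tensor names are assumptions about the model:

.. code-block:: python

    from tensorpack.callbacks import ProcessTensors

    # Fetch two tensors (assumed to exist in the graph) together with each
    # training step and hand their values to a plain Python function.
    callback = ProcessTensors(
        ['cost', 'accuracy'],
        lambda cost, acc: print('cost={:.4f} accuracy={:.4f}'.format(cost, acc)))
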
@@ -57,12 +57,10 @@ class InferenceRunnerBase(Callback):
 """ Base class for inference runner.
 Note:
-
 1. InferenceRunner will use `input.size()` to determine
 how much iterations to run, so you're responsible to ensure that
 `size()` is reasonable.
 2. Only works with instances of `TowerTrainer`.
 """
 def __init__(self, input, infs):
 """
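
For context, a sketch of a typical ``InferenceRunner`` setup built on a dataflow with a finite ``size()``; the statistic name ``cost`` is an assumption about the model:

.. code-block:: python

    from tensorpack.dataflow import BatchData, FakeData
    from tensorpack.callbacks import InferenceRunner, ScalarStats

    # A stand-in validation dataflow; InferenceRunner relies on input.size()
    # to decide how many iterations one round of inference takes.
    dataset_val = BatchData(FakeData([[28, 28], [1]], size=256), 64, remainder=True)

    callback = InferenceRunner(dataset_val, [ScalarStats('cost')])
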
@@ -108,7 +108,7 @@ class MinSaver(Callback):
 MinSaver('val-error')
-Notes:
+Note:
 It assumes that :class:`ModelSaver` is used with the same ``checkpoint_dir``
 and appears earlier in the callback list.
 The default for both :class:`ModelSaver` and :class:`MinSaver`
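
An illustrative callback list matching the note above; ``'val-error'`` is an assumed monitored statistic name:

.. code-block:: python

    from tensorpack.callbacks import ModelSaver, MinSaver

    callbacks = [
        ModelSaver(),           # writes checkpoints to the logger directory
        MinSaver('val-error'),  # must come after ModelSaver, same checkpoint_dir
    ]
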
@@ -208,7 +208,7 @@ class FixedSizeData(ProxyDataFlow):
 next call will continue the previous iteration over ``ds``,
 instead of reinitializing an iterator.
-Examples:
+Example:
 .. code-block:: none
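
A small sketch of the behavior described above, using a fake dataflow as a stand-in:

.. code-block:: python

    from tensorpack.dataflow import FakeData, FixedSizeData

    ds = FixedSizeData(FakeData([[2]], size=1000), 100)
    # Each pass over `ds` yields exactly 100 datapoints; the next pass continues
    # the underlying iterator instead of restarting it.
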
@@ -476,7 +476,7 @@ class JoinData(DataFlow):
 """
 Join the components from each DataFlow.
-Examples:
+Example:
 .. code-block:: none
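
A sketch of joining components, with fake dataflows standing in for real ones:

.. code-block:: python

    from tensorpack.dataflow import FakeData, JoinData

    df1 = FakeData([[4]], size=100)        # each datapoint: [x]
    df2 = FakeData([[2], [3]], size=100)   # each datapoint: [y, z]
    df = JoinData([df1, df2])              # each datapoint: [x, y, z]
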
@@ -195,7 +195,7 @@ class ILSVRC12(ILSVRC12Files):
 By default, it tries to automatically detect the structure.
 You probably do not need to care about this option because 'original' is what people usually have.
-Examples:
+Example:
 When `dir_structure=='original'`, `dir` should have the following structure:
@@ -220,12 +220,12 @@ class PrefetchDataZMQ(_MultiProcessZMQDataFlow):
 As a result, we have the following guarantee on the dataflow correctness:
 a. When ``nr_proc=1``, this dataflow produces the same data as the
 given dataflow in the same order.
 b. When ``nr_proc>1``, if each sample from the given dataflow is i.i.d. (e.g. fully shuffled),
 then this dataflow produces the **same distribution** of data as the given dataflow.
 This implies that there will be duplication, reordering, etc.
 You probably only want to use it for training.
 If the samples are not i.i.d., the behavior is undefined.
 2. `reset_state()` of the given dataflow will be called **once and only once** in the worker processes.
 3. The fork of processes happened in this dataflow's `reset_state()` method.
 Please note that forking a TensorFlow GPU session may be unsafe.
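
A hedged sketch of the intended training-only usage; the underlying dataflow is assumed to be shuffled (i.i.d. samples):

.. code-block:: python

    from tensorpack.dataflow import FakeData, PrefetchDataZMQ

    ds = FakeData([[224, 224, 3]], size=1000)
    # With nr_proc > 1, ordering is no longer guaranteed, so use this for training only.
    ds = PrefetchDataZMQ(ds, nr_proc=4)
    # Forking happens inside reset_state(); avoid creating a TF GPU session before it.
    ds.reset_state()
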
@@ -78,7 +78,7 @@ class DistributedParameterServerBuilder(DataParallelBuilder, DistributedBuilderBase):
 Note:
 1. Gradients are not averaged across workers, but applied to PS variables
 directly (either with or without locking depending on the optimizer).
 """
 def __init__(self, towers, server, caching_device):
@@ -150,8 +150,7 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
 Note:
 1. Gradients are not averaged across workers, but applied to PS variables
 directly (either with or without locking depending on the optimizer).
 2. Some details about collections: all variables created inside tower
 will become local variables,
 and a clone will be made in global variables for all trainable/model variables.
@@ -128,9 +128,6 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
 It is an equivalent of ``--variable_update=parameter_server`` in
 `tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
-Attribute:
-grads: list of (g, v). Averaged gradients, available after build()
 """
 def __init__(self, towers, ps_device):
 """
@@ -144,6 +141,8 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
 def build(self, get_grad_fn, get_opt_fn):
 """
+Build the graph, and set self.grads to a list of (g, v), containing the averaged gradients.
 Args:
 get_grad_fn (-> [(grad, var)]):
 get_opt_fn (-> tf.train.Optimizer): callable which returns an optimizer
@@ -187,10 +186,6 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
 It is an equivalent of ``--variable_update=replicated`` in
 `tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
-Attribute:
-grads: #GPU number of lists of (g, v). Synchronized gradients on each device,
-available after build(). Though on different devices, they should contain the same value.
 """
 def __init__(self, towers, average, mode):
@@ -201,6 +196,9 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
 def build(self, get_grad_fn, get_opt_fn):
 """
+Build the graph, and set self.grads to #GPU number of lists of (g, v), containing the
+all-reduced gradients on each device.
 Args:
 get_grad_fn (-> [(grad, var)]):
 get_opt_fn (-> tf.train.Optimizer): callable which returns an optimizer
@@ -210,7 +210,7 @@ def remap_input_source(input, names):
 Returns:
 InputSource:
-Examples:
+Example:
 .. code-block:: python
@@ -88,34 +88,35 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
 Args:
 internal_update (bool): if False, add EMA update ops to
-`tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
-by control dependencies.
+`tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies.
 They are very similar in speed, but `internal_update=True` can be used
 when you have conditionals in your model, or when you have multiple networks to train.
+Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
 sync_statistics: either None or "nccl". By default (None), it uses statistics of the input tensor to normalize.
 When set to "nccl", this layer must be used under tensorpack multi-gpu trainers,
 and it then uses per-machine (multiple GPU) statistics to normalize.
 This option has no effect when not training.
 The option is also known as "Cross-GPU BatchNorm" as mentioned in https://arxiv.org/abs/1711.07240.
+Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222
 Variable Names:
 * ``beta``: the bias term. Will be zero-inited by default.
-* ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``.
+* ``gamma``: the scale term. Will be one-inited by default.
 * ``mean/EMA``: the moving average of mean.
 * ``variance/EMA``: the moving average of variance.
 Note:
-1. Combinations of ``training`` and ``ctx.is_training``:
-* ``training == ctx.is_training``: standard BN, EMA are
-maintained during training and used during inference. This is
-the default.
+Combinations of ``training`` and ``ctx.is_training``:
+
+* ``training == ctx.is_training``: standard BN, EMA are maintained during training
+and used during inference. This is the default.
 * ``training and not ctx.is_training``: still use batch statistics in inference.
 * ``not training and ctx.is_training``: use EMA to normalize in
 training. This is useful when you load a pre-trained BN and
 don't want to fine tune the EMA. EMA will not be updated in
 this case.
 """
 # parse shapes
 data_format = get_data_format(data_format, tfmode=False)
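
A sketch of how the two options above are usually passed in, written as a tower function; the layer name ``'bn'`` and the surrounding setup are illustrative assumptions:

.. code-block:: python

    from tensorpack.tfutils.argscope import argscope
    from tensorpack.models import BatchNorm

    def tower_func(image):
        # Under a tensorpack multi-GPU trainer, sync_statistics='nccl' normalizes
        # with per-machine (all-GPU) batch statistics instead of per-GPU ones.
        with argscope(BatchNorm, sync_statistics='nccl'):
            return BatchNorm('bn', image)
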
@@ -238,12 +239,16 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
 if new_shape is not None:
     batch_mean = tf.reshape(batch_mean, new_shape)
     batch_var = tf.reshape(batch_var, new_shape)
-    r_gamma = tf.reshape(gamma, new_shape)
-    r_beta = tf.reshape(beta, new_shape)
+    # Using fused_batch_norm(is_training=False) is actually slightly faster,
+    # but hopefully this call will be JITed in the future.
+    xn = tf.nn.batch_normalization(
+        inputs, batch_mean, batch_var,
+        tf.reshape(beta, new_shape),
+        tf.reshape(gamma, new_shape), epsilon)
 else:
-    r_gamma, r_beta = gamma, beta
-xn = tf.nn.batch_normalization(
-    inputs, batch_mean, batch_var, r_beta, r_gamma, epsilon)
+    xn = tf.nn.batch_normalization(
+        inputs, batch_mean, batch_var,
+        beta, gamma, epsilon)
 if ctx.is_main_training_tower:
     ret = update_bn_ema(
@@ -65,7 +65,7 @@ def layer_register(
 Returns:
 A decorator used to register a layer.
-Examples:
+Example:
 .. code-block:: python

@@ -94,7 +94,7 @@ def rename_tflayer_get_variable():
 Returns:
 A context where the variables are renamed.
-Examples:
+Example:
 .. code-block:: python
@@ -32,7 +32,7 @@ class PredictorBase(object):
 """
 Call the predictor on some inputs.
-Examples:
+Example:
 When you have a predictor defined with two inputs, call it with:
 .. code-block:: python
@@ -17,7 +17,7 @@ def auto_reuse_variable_scope(func):
 A decorator which automatically reuses the current variable scope if the
 function has been called with the same variable scope before.
-Examples:
+Example:
 .. code-block:: python

@@ -60,7 +60,7 @@ def under_name_scope(name=None):
 A decorator which makes the function happen under a name scope.
 The default name is the function itself.
-Examples:
+Example:
 .. code-block:: python

@@ -92,7 +92,7 @@ def under_variable_scope():
 A decorator which makes the function happen under a variable scope,
 which is named by the function itself.
-Examples:
+Example:
 .. code-block:: python
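
A combined sketch of two of the decorators touched above; the function bodies are illustrative only:

.. code-block:: python

    import tensorflow as tf
    from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope, under_name_scope

    @auto_reuse_variable_scope
    def encoder(x):
        # Variables are created on the first call and reused on later calls
        # made under the same variable scope.
        return tf.layers.dense(x, 128, name='fc')

    @under_name_scope()
    def rms(x):
        # Ops created here live under the name scope "rms" (the function name).
        return tf.sqrt(tf.reduce_mean(tf.square(x)))
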
@@ -107,7 +107,7 @@ def add_tensor_summary(x, types, name=None, collections=None,
 set to True, calling this function under other TowerContext
 has no effect.
-Examples:
+Example:
 .. code-block:: python

@@ -170,7 +170,7 @@ def add_param_summary(*summary_lists, **kwargs):
 Summary type is defined in :func:`add_tensor_summary`.
 collections (list[str]): collections of the summary ops.
-Examples:
+Example:
 .. code-block:: python
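
A sketch of the parameter-summary helper; the variable-name regex is an assumption about how weights are named in the model:

.. code-block:: python

    from tensorpack.tfutils.summary import add_param_summary

    # Called inside the main training tower: histogram + RMS summaries for
    # every variable whose name matches the regex.
    add_param_summary(('.*/W', ['histogram', 'rms']))
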
@@ -204,7 +204,7 @@ def TowerContext(tower_name, is_training, vs_name=''):
 Returns:
 A context within which the tower function should be called.
-Examples:
+Example:
 .. code-block:: python
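
For orientation, a sketch of building a graph under a training ``TowerContext``; the placeholder and layer are illustrative:

.. code-block:: python

    import tensorflow as tf
    from tensorpack.tfutils.tower import TowerContext

    image = tf.placeholder(tf.float32, [None, 28, 28], name='image')
    # '' names the main tower; is_training selects the training behavior of layers.
    with TowerContext('', is_training=True):
        logits = tf.layers.dense(tf.layers.flatten(image), 10)
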
@@ -196,7 +196,7 @@ class AutoResumeTrainConfig(TrainConfig):
 Otherwise, resume will take priority.
 kwargs: same as in :class:`TrainConfig`.
-Notes:
+Note:
 The main goal of this class is to let a training job to resume
 without changing any line of code or command line arguments.
 So it's useful to let resume take priority over user-provided arguments sometimes:
@@ -59,7 +59,7 @@ def launch_train_with_config(config, trainer):
 config (TrainConfig):
 trainer (Trainer): an instance of :class:`SingleCostTrainer`.
-Examples:
+Example:
 .. code-block:: python
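
A sketch of the call; ``config`` is an assumed, already-built ``TrainConfig``:

.. code-block:: python

    from tensorpack import SimpleTrainer
    from tensorpack.train import launch_train_with_config

    # config = TrainConfig(model=..., dataflow=..., callbacks=[...])  # assumed
    launch_train_with_config(config, SimpleTrainer())
    # Multi-GPU variant (8 GPUs, parameter-server style):
    # launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(8))
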
@@ -79,7 +79,7 @@ class TowerTrainer(Trainer):
 Returns:
 an :class:`OnlinePredictor`.
-Examples:
+Example:
 .. code-block:: none

@@ -160,8 +160,7 @@ class SingleCostTrainer(TowerTrainer):
 Note:
 `get_cost_fn` will be the tower function.
-It must follows the
-`rules of tower function.
+It must follows the `rules of tower function.
 <http://tensorpack.readthedocs.io/en/latest/tutorial/trainer.html#tower-trainer>`_.
 """
 get_cost_fn = TowerFuncWrapper(get_cost_fn, inputs_desc)
@@ -40,7 +40,7 @@ globalns = GlobalNS()
 """
 A namespace to store global variables.
-Examples:
+Example:
 .. code-block:: none
@@ -31,7 +31,7 @@ def humanize_time_delta(sec):
 Returns:
 str - time difference as a readable string
-Examples:
+Example:
 .. code-block:: python

@@ -96,7 +96,7 @@ def fix_rng_seed(seed):
 Note:
 See https://github.com/tensorpack/tensorpack/issues/196.
-Examples:
+Example:
 Fix random seed in both tensorpack and tensorflow.
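
A sketch of the seeding pattern the docstring describes; 42 is an arbitrary seed:

.. code-block:: python

    import tensorflow as tf
    from tensorpack.utils.utils import fix_rng_seed

    seed = 42
    fix_rng_seed(seed)        # seeds the RNGs that tensorpack creates afterwards
    tf.set_random_seed(seed)  # TensorFlow's graph-level seed is set separately
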