Commit 076a728f, authored Feb 12, 2017 by Yuxin Wu
Commit message: some docs update & add optimizer with gradproc
Parent: 3f91978c

Showing 19 changed files, with 145 additions and 71 deletions.
Changed files:

  examples/A3C-Gym/README.md                        +1   -0
  examples/A3C-Gym/train-atari.py                   +2   -2
  examples/CTC-TIMIT/train-timit.py                 +1   -1
  examples/Char-RNN/char-rnn.py                     +1   -1
  examples/DeepQNetwork/DQN.py                      +2   -2
  examples/GAN/InfoGAN-mnist.py                     +5   -7
  examples/HED/hed.py                               +2   -1
  examples/PennTreebank/PTB-LSTM.py                 +1   -1
  examples/ResNet/README.md                         +4   -0
  examples/ResNet/cifar10-resnet.py                 +3   -0
  examples/ResNet/imagenet-resnet.py                +0   -5
  examples/SpatialTransformer/mnist-addition.py     +2   -2
  examples/mnist-convnet.py                         +4   -2
  tensorpack/models/model_desc.py                   +28  -42
  tensorpack/tfutils/__init__.py                    +0   -1
  tensorpack/tfutils/gradproc.py                    +1   -1
  tensorpack/tfutils/modelutils.py                  +37  -2
  tensorpack/tfutils/optimizer.py  (new file)       +49  -0
  tensorpack/train/multigpu.py                      +2   -1

examples/A3C-Gym/README.md

@@ -19,6 +19,7 @@ Also note that multi-GPU doesn't give you obvious speedup here,
 because the bottleneck in this implementation is not computation but data.

 Some practicical notes:
+1. On machines without huge memory, enabling tcmalloc may keep training throughput more stable.
 2. Occasionally, processes may not get terminated completely. It is suggested to use
    `systemd-run` to run any multiprocess Python program to get a cgroup dedicated for the task.

examples/A3C-Gym/train-atari.py

@@ -133,8 +133,8 @@ class Model(ModelDesc):
                                    value_loss, pred_reward, advantage, self.cost)

     def get_gradient_processor(self):
-        return [MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
-                SummaryGradient()]
+        return [gradproc.MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
+                gradproc.SummaryGradient()]


 class MySimulatorMaster(SimulatorMaster, Callback):

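For readers unfamiliar with tensorpack's gradient processors, here is a minimal sketch in plain TF 1.x (not tensorpack internals) of what a `MapGradient`-style processor does: apply a transform such as `tf.clip_by_average_norm` to every gradient in a `(grad, var)` list before the optimizer consumes it. The toy variable, cost, and optimizer choice are assumptions for illustration only.

```python
import tensorflow as tf

def map_gradients(grads_and_vars, fn):
    # Transform each gradient, leave the variables untouched.
    return [(fn(g), v) for g, v in grads_and_vars if g is not None]

# Toy variable/cost just so the sketch is self-contained.
w = tf.get_variable('w', shape=[10])
cost = tf.reduce_sum(tf.square(w))

opt = tf.train.AdamOptimizer(1e-3)
grads_and_vars = opt.compute_gradients(cost)
clipped = map_gradients(grads_and_vars,
                        lambda g: tf.clip_by_average_norm(g, 0.1))
train_op = opt.apply_gradients(clipped)
```
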
examples/CTC-TIMIT/train-timit.py

@@ -74,7 +74,7 @@ class Model(ModelDesc):
         summary.add_moving_summary(err, self.cost)

     def get_gradient_processor(self):
-        return [GlobalNormClip(5), SummaryGradient()]
+        return [gradproc.GlobalNormClip(5), gradproc.SummaryGradient()]


 def get_data(path, isTrain, stat_file):

examples/Char-RNN/char-rnn.py

@@ -106,7 +106,7 @@ class Model(ModelDesc):
         summary.add_moving_summary(self.cost)

     def get_gradient_processor(self):
-        return [GlobalNormClip(5)]
+        return [gradproc.GlobalNormClip(5)]


 def get_config():

examples/DeepQNetwork/DQN.py

@@ -150,8 +150,8 @@ class Model(ModelDesc):
         return tf.group(*ops, name='update_target_network')

     def get_gradient_processor(self):
-        return [MapGradient(lambda grad: tf.clip_by_global_norm([grad], 5)[0][0]),
-                SummaryGradient()]
+        return [gradproc.GlobalNormalClip(10),
+                gradproc.SummaryGradient()]


 def get_config():

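The old code clipped each gradient separately by calling `tf.clip_by_global_norm` on a single-element list; the new processor instead clips all gradients jointly against one norm budget. A hedged plain-TF 1.x illustration of that joint clipping (the toy variables, cost, optimizer, and the threshold of 10 are illustrative, not tensorpack internals):

```python
import tensorflow as tf

# Toy setup so the sketch is self-contained.
w = tf.get_variable('w', shape=[10])
b = tf.get_variable('b', shape=[10])
cost = tf.reduce_sum(tf.square(w - b))

opt = tf.train.RMSPropOptimizer(1e-3)
grads_and_vars = opt.compute_gradients(cost)
grads, varlist = zip(*grads_and_vars)

# Rescale ALL gradients together so their combined global norm is at most 10.
clipped, global_norm = tf.clip_by_global_norm(list(grads), 10.0)
train_op = opt.apply_gradients(list(zip(clipped, varlist)))
```
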
examples/GAN/InfoGAN-mnist.py

@@ -96,17 +96,13 @@ class Model(GANModelDesc):
         fake_sample_viz = tf.cast((fake_sample + 1) * 128.0, tf.uint8, name='viz')
         tf.summary.image('gen', fake_sample_viz, max_outputs=30)

-        # TODO investigate how bn stats should be updated across two discrim
+        # may need to investigate how bn stats should be updated across two discrim
         with tf.variable_scope('discrim'):
             real_pred, _ = self.discriminator(real_sample)
         with tf.variable_scope('discrim', reuse=True):
             fake_pred, dist_param = self.discriminator(fake_sample)

-        # post-process output vector from discriminator to become valid
-        # distribution parameters
-        encoder_activation = self.factors.encoder_activation(dist_param)
-
         """
         Mutual information between x (i.e. zc in this case) and some
         information s (the generated samples in this case):

@@ -130,6 +126,8 @@ class Model(GANModelDesc):
         # Adding this term may make the curve less stable because the
         # entropy estimated from the samples is not the true value.

+        # post-process output vector from discriminator to obtain valid distribution parameters
+        encoder_activation = self.factors.encoder_activation(dist_param)
         cond_ents = self.factors.entropy(zc, encoder_activation)
         cond_entropy = tf.add_n(cond_ents, name="total_conditional_entropy")

@@ -139,7 +137,7 @@ class Model(GANModelDesc):
         # default GAN objective
         self.build_losses(real_pred, fake_pred)

-        # subtract mutual information for latent factores (we want to maximize them)
+        # subtract mutual information for latent factors (we want to maximize them)
         self.g_loss = tf.subtract(self.g_loss, MI, name='total_g_loss')
         self.d_loss = tf.subtract(self.d_loss, MI, name='total_d_loss')

@@ -150,7 +148,7 @@ class Model(GANModelDesc):
     def get_gradient_processor_g(self):
         # generator learns 5 times faster
-        return [CheckGradient(), ScaleGradient(('.*', 5), log=False)]
+        return [gradproc.ScaleGradient(('.*', 5), log=False)]


 def get_data():

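As background for the `MI` term subtracted from both losses: the mutual-information docstring in this file alludes to the standard InfoGAN variational lower bound (Chen et al., 2016). In the usual notation (not this file's exact wording):

I(c; G(z, c)) \;\ge\; \mathbb{E}_{c \sim P(c),\, x \sim G(z, c)}\big[\log Q(c \mid x)\big] + H(c)

Here Q(c | x) is the auxiliary distribution read off the discriminator head (the `dist_param` / `encoder_activation` above), H(c) is the entropy of the latent code, and the conditional-entropy term is what `cond_entropy` estimates from samples.
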
examples/HED/hed.py

@@ -93,7 +93,8 @@ class Model(ModelDesc):
         add_moving_summary(costs + [wrong, self.cost])

     def get_gradient_processor(self):
-        return [ScaleGradient([('convfcweight.*', 0.1), ('conv5_.*', 5)])]
+        return [gradproc.ScaleGradient([
+            ('convfcweight.*', 0.1), ('conv5_.*', 5)])]


 def get_data(name):

examples/PennTreebank/PTB-LSTM.py

@@ -101,7 +101,7 @@ class Model(ModelDesc):
                 s[1].h.assign(z))

     def get_gradient_processor(self):
-        return [GlobalNormClip(5)]
+        return [gradproc.GlobalNormClip(5)]


 def get_config():

examples/ResNet/README.md

@@ -12,6 +12,10 @@ Models can be [downloaded here](https://goo.gl/6XjK9V).
 | ResNet 50  | 7.13% | 24.12% |
 | ResNet 101 | 6.54% | 22.89% |

+```bash
+./imagenet-resnet.py --data /path/to/ILSVRC --gpu 0,1,2,3 -d 18
+```
+
 ## load-resnet.py

examples/ResNet/cifar10-resnet.py

@@ -25,6 +25,9 @@ n=5, about 7.1% val error after 67k steps (8.6 step/s)
 n=18, about 5.95% val error after 80k steps (2.6 step/s)
 n=30: a 182-layer network, about 5.6% val error after 51k steps (1.55 step/s)
 This model uses the whole training set instead of a train-val split.
+
+To train:
+    ./cifar10-resnet.py --gpu 0,1
 """

 BATCH_SIZE = 128

examples/ResNet/imagenet-resnet.py

@@ -17,11 +17,6 @@ from tensorpack.utils.stats import RatioCounter
 from tensorpack.tfutils.symbolic_functions import *
 from tensorpack.tfutils.summary import *

-"""
-Training code of Pre-Activation version of ResNet on ImageNet.
-It mainly follows the setup in fb.resnet.torch, and get similar performance.
-"""
-
 TOTAL_BATCH_SIZE = 256
 INPUT_SHAPE = 224
 DEPTH = None

examples/SpatialTransformer/mnist-addition.py

@@ -86,8 +86,8 @@ class Model(ModelDesc):
         self.cost = tf.add_n([wd_cost, cost], name='cost')

     def get_gradient_processor(self):
-        return [MapGradient(lambda grad: tf.clip_by_global_norm([grad], 5)[0][0]),
-                ScaleGradient(('STN.*', 0.1)), SummaryGradient()]
+        return [gradproc.ScaleGradient(('STN.*', 0.1)),
+                gradproc.SummaryGradient()]


 def get_data(isTrain):

examples/mnist-convnet.py

@@ -24,8 +24,10 @@ USE_SLIM = False
 class Model(ModelDesc):

     def _get_inputs(self):
-        """Define all the input variables (with type, shape, name) that'll be
-        fed into the graph to produce a cost. """
+        """
+        Define all the inputs (with type, shape, name) that
+        the graph will need.
+        """
         return [InputDesc(tf.float32, (None, IMAGE_SIZE, IMAGE_SIZE), 'input'),
                 InputDesc(tf.int32, (None,), 'label')]

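For context, each `InputDesc` above declares the dtype, shape and name of one input. A hedged sketch of the raw TF 1.x placeholders tensorpack would build from these declarations (IMAGE_SIZE = 28 is assumed here, as usual for MNIST; it is not shown in this diff):

```python
import tensorflow as tf

IMAGE_SIZE = 28   # assumed value for MNIST, not taken from this diff

image = tf.placeholder(tf.float32, (None, IMAGE_SIZE, IMAGE_SIZE), name='input')
label = tf.placeholder(tf.int32, (None,), name='label')
```
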
tensorpack/models/model_desc.py

@@ -9,9 +9,9 @@ import pickle
 import six

 from ..utils import logger, INPUTS_KEY
 from ..utils.argtools import memoized
-from ..tfutils.gradproc import CheckGradient
+from ..tfutils.modelutils import apply_slim_collections
 from ..tfutils.summary import add_moving_summary
 from ..tfutils.tower import get_current_tower_context

 __all__ = ['InputDesc', 'InputVar', 'ModelDesc', 'ModelFromMetaGraph']

@@ -41,8 +41,10 @@ class InputDesc(object):
         return pickle.loads(buf)


-# TODO print warning?
-InputVar = InputDesc
+class InputVar(InputDesc):
+    def __init__(self, *args, **kwargs):
+        logger.warn("[Deprecated] InputVar was renamed to InputDesc!")
+        super(InputVar, self).__init__(*args, **kwargs)


 @six.add_metaclass(ABCMeta)

@@ -50,6 +52,7 @@ class ModelDesc(object):
     """ Base class for a model description """

+    # inputs:
+    @memoized
     def get_reused_placehdrs(self):
         """
         Create or return (if already created) raw input TF placeholders in the graph.

@@ -57,11 +60,7 @@ class ModelDesc(object):
         Returns:
             list[tf.Tensor]: the list of input placeholders in the graph.
         """
-        if hasattr(self, 'reuse_input_vars'):
-            return self.reuse_input_vars
-        ret = self.build_placeholders()
-        self.reuse_input_vars = ret
-        return ret
+        return self.build_placeholders()

     def get_input_vars(self):
         # this wasn't a public API anyway

@@ -70,7 +69,7 @@ class ModelDesc(object):
     def build_placeholders(self, prefix=''):
         """
-        For each input, create new placeholders with optional prefix and
+        For each InputDesc, create new placeholders with optional prefix and
         return them. Useful when building new towers.

         Returns:

@@ -105,8 +104,6 @@ class ModelDesc(object):
     def _get_input_vars(self):  # keep backward compatibility
         raise NotImplementedError()

+    # build graph:
     def build_graph(self, model_inputs):
         """
         Build the whole symbolic graph.

@@ -121,46 +118,35 @@ class ModelDesc(object):
     def _build_graph(self, inputs):
         pass

+    # set cost. Only for single-cost model.
     def get_cost(self):
         """
-        Return the cost tensor in the graph. Called by some of the :class:`tensorpack.train.Trainer` which
-        assumes single-cost models.
+        Return the cost tensor in the graph.
+        Used by some of the tensorpack :class:`Trainer` which assumes single-cost models.
         You can ignore this method if you use your own trainer with more than one cost.

-        This function also apply tfslim collections to the cost automatically, including
-        ``tf.GraphKeys.REGULARIZATION_LOSSES`` and ``tf.GraphKeys.UPDATE_OPS``. This is because slim users would expect
-        the regularizer being automatically applied once used in slim layers.
+        It calls :meth:`ModelDesc._get_cost()` which by default returns
+        ``self.cost``. You can override :meth:`_get_cost()` if needed.
+
+        This function also applies tfslim collections to the cost automatically,
+        including ``tf.GraphKeys.REGULARIZATION_LOSSES`` and ``tf.GraphKeys.UPDATE_OPS``.
+        This is because slim users would expect the regularizer being automatically applied once used in slim layers.
         """
         cost = self._get_cost()
-        regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-        if len(regulization_losses) > 0:
-            reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
-            cost = tf.add(reg_loss, cost, name='total_cost')
-            add_moving_summary(reg_loss, cost)
-
-        # As these batch-norm statistics quickly accumulate, there is no significant loss of accuracy
-        # if only the main tower handles all batch-normalization updates, which are then shared across
-        # the towers
-        ctx = get_current_tower_context()
-        if ctx is not None and ctx.is_main_training_tower:
-            non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
-            if non_grad_updates:
-                logger.info("Apply UPDATE_OPS collection on cost.")
-                with tf.control_dependencies(non_grad_updates):
-                    cost = tf.identity(cost)
-        return cost
+        return apply_slim_collections(cost)

     def _get_cost(self, *args):
-        # the model cost so far
         return self.cost

+    # set optimizer. only for single-optimizer model.
+    @memoized
     def get_optimizer(self):
         """
         Return the optimizer used in the task.
+        Used by some of the tensorpack :class:`Trainer` which only uses a single optimizer.
+        You can ignore this method if you use your own trainer with more than one optimizers.
+
+        Users of :class:`ModelDesc` will need to implement `_get_optimizer()`,
+        which will only be called once per each model.

         Returns:
             a :class:`tf.train.Optimizer` instance.
         """

@@ -170,7 +156,7 @@ class ModelDesc(object):
             raise NotImplementedError()

     def get_gradient_processor(self):
-        """ Return a list of :class:`tensorpack.tfutils.GradientProcessor`.
+        """ (Deprecated)
+        Return a list of :class:`tensorpack.tfutils.GradientProcessor`.
         They will be executed by the trainer in the given order.
         """
         return [  # SummaryGradient(),

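To make the new single-cost contract concrete, here is a hedged sketch of a minimal `ModelDesc` subclass under these changes: `_build_graph` sets `self.cost`, the default `_get_cost()` returns it, and `get_cost()` wraps it with `apply_slim_collections()`. The import path, layer choices, and loss below are illustrative, not taken from this commit.

```python
import tensorflow as tf
from tensorpack import ModelDesc, InputDesc   # assumed top-level import path

class MyModel(ModelDesc):
    def _get_inputs(self):
        # dtype, shape and name of each input the graph will need
        return [InputDesc(tf.float32, (None, 28, 28), 'input'),
                InputDesc(tf.int32, (None,), 'label')]

    def _build_graph(self, inputs):
        image, label = inputs
        flat = tf.reshape(image, [-1, 28 * 28])
        logits = tf.layers.dense(flat, 10)
        # Setting self.cost is enough: the default _get_cost() returns it and
        # get_cost() applies the slim collections on top.
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=logits),
            name='cross_entropy_loss')

    def _get_optimizer(self):
        # Single optimizer for the single-optimizer trainers.
        return tf.train.AdamOptimizer(1e-3)
```
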
tensorpack/tfutils/__init__.py

@@ -19,7 +19,6 @@ def _global_import(name):

 _TO_IMPORT = set([
     'common',
     'sessinit',
-    'gradproc',
     'argscope',
     'tower',
 ])

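This removal presumably explains the example changes above: `gradproc` is no longer wildcard-imported into the `tensorpack.tfutils` namespace, so its classes are referenced through the module instead. A minimal sketch of the new usage pattern:

```python
from tensorpack.tfutils import gradproc

# Module-qualified gradient processors, matching the updated examples.
processors = [gradproc.GlobalNormClip(5),
              gradproc.SummaryGradient()]
```
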
tensorpack/tfutils/gradproc.py

@@ -20,7 +20,7 @@ def apply_grad_processors(grads, gradprocs):
     """
     Args:
         grads (list): list of (grad, var).
-        gradprocs (list): list of :class:`GradientProcessor` instances.
+        gradprocs (list[GradientProcessor]): gradient processors to apply.

     Returns:
         list: list of (grad, var) went through the processors.
     """

tensorpack/tfutils/modelutils.py

 # -*- coding: UTF-8 -*-
 # File: modelutils.py
-# Author: Yuxin Wu <ppwwyyxx@gmail.com>
+# Author: tensorpack contributors

 import tensorflow as tf
 from termcolor import colored

 from ..utils import logger
+from .summary import add_moving_summary
+from .tower import get_current_tower_context

-__all__ = ['describe_model', 'get_shape_str']
+__all__ = ['describe_model', 'get_shape_str', 'apply_slim_collections']


 def describe_model():

@@ -46,3 +48,36 @@ def get_shape_str(tensors):
         assert isinstance(tensors, (tf.Tensor, tf.Variable)), "Not a tensor: {}".format(type(tensors))
         shape_str = str(tensors.get_shape().as_list())
     return shape_str
+
+
+def apply_slim_collections(cost):
+    """
+    Apply slim collections to the cost, including:
+
+    1. adding the cost with the regularizers in ``tf.GraphKeys.REGULARIZATION_LOSSES``.
+    2. make the cost depend on ``tf.GraphKeys.UPDATE_OPS``.
+
+    Args:
+        cost: a scalar tensor
+
+    Return:
+        a scalar tensor, the cost after applying the collections.
+    """
+    regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+    if len(regulization_losses) > 0:
+        logger.info("Applying REGULARIZATION_LOSSES on cost.")
+        reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
+        cost = tf.add(reg_loss, cost, name='total_cost')
+        add_moving_summary(reg_loss, cost)
+
+    # As these batch-norm statistics quickly accumulate, there is no significant loss of accuracy
+    # if only the main tower handles all batch-normalization updates, which are then shared across
+    # the towers
+    ctx = get_current_tower_context()
+    if ctx is not None and ctx.is_main_training_tower:
+        non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
+        if non_grad_updates:
+            logger.info("Applying UPDATE_OPS collection on cost.")
+            with tf.control_dependencies(non_grad_updates):
+                cost = tf.identity(cost, name='cost_with_update')
+    return cost

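A hedged usage sketch of the new helper: given a scalar cost built with slim-style regularizers, `apply_slim_collections` folds `REGULARIZATION_LOSSES` into the cost and, inside the main training tower, makes it depend on `UPDATE_OPS`. The toy variable and regularizer below are illustrative.

```python
import tensorflow as tf
from tensorpack.tfutils.modelutils import apply_slim_collections

# A toy cost with an L2 regularizer registered in REGULARIZATION_LOSSES,
# so the helper has something to fold in.
w = tf.get_variable('w', shape=[10],
                    regularizer=tf.contrib.layers.l2_regularizer(1e-4))
raw_cost = tf.reduce_sum(tf.square(w), name='raw_cost')

total_cost = apply_slim_collections(raw_cost)
# Outside a training tower only the regularization term is added; inside the
# main tower, UPDATE_OPS (e.g. batch-norm moving averages) are attached too.
```
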
tensorpack/tfutils/optimizer.py  (new file, mode 100644)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: optimizer.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>

import tensorflow as tf
from .gradproc import apply_grad_processors as apply_gradproc


class ProxyOptimizer(tf.train.Optimizer):
    def __init__(self, opt):
        self._opt = opt

    def compute_gradients(self, *args, **kwargs):
        return self._opt.compute_gradients(*args, **kwargs)

    def get_slot(self, *args, **kwargs):
        return self._opt.get_slot(*args, **kwargs)

    def get_slot_names(self, *args, **kwargs):
        return self._opt.get_slot_names(*args, **kwargs)

    def apply_gradients(self, *args, **kwargs):
        return self._opt.apply_gradients(*args, **kwargs)


def apply_grad_processors(opt, gradprocs):
    """
    Wrapper around optimizers to apply gradient processors.

    Args:
        opt (tf.train.Optimizer):
        gradprocs (list[GradientProcessor]): gradient processors to add to the
            optimizer.

    Returns:
        a :class:`tf.train.Optimizer` instance which runs the gradient
        processors before updating the variables.
    """
    class _ApplyGradientProcessor(ProxyOptimizer):
        def __init__(self, opt, gradprocs):
            self._gradprocs = gradprocs
            super(_ApplyGradientProcessor, self).__init__(opt)

        def apply_gradients(self, grads_and_vars,
                            global_step=None, name=None):
            g = apply_gradproc(grads_and_vars, self._gradprocs)
            return self._opt.apply_gradients(g, global_step, name)

    return _ApplyGradientProcessor(opt, gradprocs)

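This is the "optimizer with gradproc" from the commit message. A hedged usage sketch: wrap any `tf.train.Optimizer` so the gradient processors run inside `apply_gradients()`, keeping the trainer unaware of gradient post-processing. The base optimizer, processor choices, and toy cost below are illustrative.

```python
import tensorflow as tf
from tensorpack.tfutils import gradproc
from tensorpack.tfutils.optimizer import apply_grad_processors

# Toy cost so the sketch is self-contained.
w = tf.get_variable('w', shape=[10])
cost = tf.reduce_sum(tf.square(w))

base_opt = tf.train.AdamOptimizer(1e-3)
opt = apply_grad_processors(base_opt,
                            [gradproc.GlobalNormClip(5),
                             gradproc.SummaryGradient()])

grads_and_vars = opt.compute_gradients(cost)
train_op = opt.apply_gradients(grads_and_vars)  # processors run here, before the update
```
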
tensorpack/train/multigpu.py

@@ -82,10 +82,11 @@ class SyncMultiGPUTrainer(MultiGPUTrainer,
         Args:
             config, input_queue: same as in :class:`QueueInputTrainer`.
             average_cost (bool): average the cost (instead of gradients) from
-                each tower and did backprop only once. Should no make
+                each tower and did backprop only once. This option should make no
                 difference mathematically, but may affect speed.
         """
         if config.dataflow is not None:
+            # use queueinput by default. May need to avoid this in the future (when more input type is available)
             self._input_method = QueueInput(config.dataflow, input_queue)
         else:
             self._input_method = config.data