Commit 1b73d9cc, authored Jan 06, 2017 by Yuxin Wu
add mnist-slim example. Trainer does not automatically summary total cost any more
parent cb99d524
Showing 16 changed files with 86 additions and 69 deletions (+86, -69).
examples/Atari2600/DQN.py                  +1  -0
examples/DisturbLabel/mnist-disturb.py     +1  -1
examples/DoReFa-Net/alexnet-dorefa.py      +1  -1
examples/DoReFa-Net/svhn-digit-dorefa.py   +1  -1
examples/HED/hed.py                        +1  -1
examples/Inception/inception-bn.py         +1  -1
examples/Inception/inceptionv3.py          +1  -1
examples/OpenAIGym/train-atari.py          +2  -1
examples/TIMIT/train-timit.py              +1  -1
examples/char-rnn/char-rnn.py              +1  -0
examples/mnist-convnet.py                  +56 -32
tensorpack/models/batch_norm.py            +1  -0
tensorpack/models/model_desc.py            +15 -24
tensorpack/tfutils/summary.py              +1  -1
tensorpack/train/feedfree.py               +1  -2
tensorpack/train/trainer.py                +1  -2
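As the commit message says, the trainers no longer register the total cost as a moving summary themselves (see the tensorpack/train/trainer.py and tensorpack/train/feedfree.py hunks below); every model now does it explicitly. A minimal sketch of the convention the examples adopt in this commit — the model class, input definition, and layer names here are illustrative, not part of the diff:

    from tensorpack import *
    import tensorflow as tf

    class MyModel(ModelDesc):                      # illustrative model, not from this commit
        def _get_input_vars(self):
            return [InputVar(tf.float32, [None, 784], 'input'),
                    InputVar(tf.int32, [None], 'label')]

        def _build_graph(self, input_vars):
            feature, label = input_vars
            logits = FullyConnected('fc0', feature, out_dim=10, nl=tf.identity)
            cost = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label),
                name='cross_entropy_loss')
            wd_cost = tf.mul(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                             name='regularize_loss')
            self.cost = tf.add_n([cost, wd_cost], name='total_cost')
            # the trainer used to summarize the total cost; now the model registers it itself
            add_moving_summary(cost, wd_cost, self.cost)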
examples/Atari2600/DQN.py
@@ -136,6 +136,7 @@ class Model(ModelDesc):
                                 tf.cast(BATCH_SIZE, tf.float32), name='cost')
         summary.add_param_summary([('conv.*/W', ['histogram', 'rms']),
                                    ('fc.*/W', ['histogram', 'rms'])])   # monitor all W
+        add_moving_summary(self.cost)

     def update_target_param(self):
         vars = tf.trainable_variables()
examples/DisturbLabel/mnist-disturb.py
@@ -51,9 +51,9 @@ class Model(mnist_example.Model):
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')
         wd_cost = tf.mul(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss), name='regularize_loss')
-        add_moving_summary(cost, wd_cost)
         self.cost = tf.add_n([wd_cost, cost], name='cost')
+        add_moving_summary(cost, wd_cost, self.cost)

 if __name__ == '__main__':
examples/DoReFa-Net/alexnet-dorefa.py
@@ -156,10 +156,10 @@ class Model(ModelDesc):
         # weight decay on all W of fc layers
         wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6))
-        add_moving_summary(cost, wd_cost)
         add_param_summary([('.*/W', ['histogram', 'rms'])])
         self.cost = tf.add_n([cost, wd_cost], name='cost')
+        add_moving_summary(cost, wd_cost, self.cost)

 def get_data(dataset_name):
examples/DoReFa-Net/svhn-digit-dorefa.py
@@ -121,10 +121,10 @@ class Model(ModelDesc):
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')
         # weight decay on all W of fc layers
         wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))
-        add_moving_summary(cost, wd_cost)
         add_param_summary([('.*/W', ['histogram', 'rms'])])
         self.cost = tf.add_n([cost, wd_cost], name='cost')
+        add_moving_summary(cost, wd_cost, self.cost)

 def get_config():
examples/HED/hed.py
@@ -89,9 +89,9 @@ class Model(ModelDesc):
         wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
         costs.append(wd_cost)
-        add_moving_summary(costs + [wrong])
         add_param_summary([('.*/W', ['histogram'])])   # monitor W
         self.cost = tf.add_n(costs, name='cost')
+        add_moving_summary(costs + [wrong, self.cost])

     def get_gradient_processor(self):
         return [ScaleGradient([('convfcweight.*', 0.1), ('conv5_.*', 5)])]
examples/Inception/inception-bn.py
@@ -114,10 +114,10 @@ class Model(ModelDesc):
         wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(), 80000, 0.7, True)
         wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='l2_regularize_loss')
-        add_moving_summary(wd_cost)
         add_param_summary([('.*/W', ['histogram'])])   # monitor W
         self.cost = tf.add_n([cost, wd_cost], name='cost')
+        add_moving_summary(wd_cost, self.cost)

 def get_data(train_or_test):
examples/Inception/inceptionv3.py
@@ -193,9 +193,9 @@ class Model(ModelDesc):
         wd_w = tf.train.exponential_decay(0.00004, get_global_step_var(), 80000, 0.7, True)
         wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='l2_regularize_loss')
-        add_moving_summary(loss1, loss2, wd_cost)
         self.cost = tf.add_n([0.4 * loss1, loss2, wd_cost], name='cost')
+        add_moving_summary(loss1, loss2, wd_cost, self.cost)

 def get_data(train_or_test):
examples/OpenAIGym/train-atari.py
@@ -122,13 +122,14 @@ class Model(ModelDesc):
         pred_reward = tf.reduce_mean(self.value, name='predict_reward')
         advantage = symbf.rms(advantage, name='rms_advantage')
-        summary.add_moving_summary(policy_loss, xentropy_loss, value_loss, pred_reward, advantage)
         entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                        initializer=tf.constant_initializer(0.01), trainable=False)
         self.cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
         self.cost = tf.truediv(self.cost,
                                tf.cast(tf.shape(futurereward)[0], tf.float32),
                                name='cost')
+        summary.add_moving_summary(policy_loss, xentropy_loss, value_loss,
+                                   pred_reward, advantage, self.cost)

     def get_gradient_processor(self):
         return [MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
examples/TIMIT/train-timit.py
@@ -72,7 +72,7 @@ class Model(ModelDesc):
         err = tf.edit_distance(predictions, label, normalize=True)
         err.set_shape([None])
         err = tf.reduce_mean(err, name='error')
-        summary.add_moving_summary(err)
+        summary.add_moving_summary(err, self.cost)

     def get_gradient_processor(self):
         return [GlobalNormClip(5), SummaryGradient()]
examples/char-rnn/char-rnn.py
@@ -92,6 +92,7 @@ class Model(ModelDesc):
                                             logits, symbolic_functions.flatten(nextinput))
         self.cost = tf.reduce_mean(xent_loss, name='cost')
         summary.add_param_summary([('.*/W', ['histogram'])])   # monitor histogram of all W
+        summary.add_moving_summary(self.cost)

     def get_gradient_processor(self):
         return [GlobalNormClip(5)]
examples/mnist-convnet.py
@@ -5,6 +5,7 @@
 import numpy as np
 import tensorflow as tf
+import tensorflow.contrib.slim as slim
 import os
 import sys
 import argparse
@@ -18,6 +19,7 @@ about 0.6% validation error after 30 epochs.
 from tensorpack import *

 IMAGE_SIZE = 28
+USE_SLIM = False

 class Model(ModelDesc):
@@ -39,15 +41,30 @@ class Model(ModelDesc):
         image = tf.expand_dims(image, 3)

         image = image * 2 - 1   # center the pixels values at zero

+        if USE_SLIM:
+            is_training = get_current_tower_context().is_training
+            with slim.arg_scope([slim.layers.fully_connected],
+                                weights_regularizer=slim.l2_regularizer(1e-5)):
+                l = slim.layers.conv2d(image, 32, [3, 3], scope='conv0')
+                l = slim.layers.max_pool2d(l, [2, 2], scope='pool0')
+                l = slim.layers.conv2d(l, 32, [3, 3], padding='SAME', scope='conv1')
+                l = slim.layers.conv2d(l, 32, [3, 3], scope='conv2')
+                l = slim.layers.max_pool2d(l, [2, 2], scope='pool1')
+                l = slim.layers.conv2d(l, 32, [3, 3], scope='conv3')
+                l = slim.layers.flatten(l, scope='flatten')
+                l = slim.layers.fully_connected(l, 512, scope='fc0')
+                l = slim.layers.dropout(l, is_training=is_training)
+                logits = slim.layers.fully_connected(l, 10, activation_fn=None, scope='fc1')
+        else:
             # The context manager `argscope` sets the default option for all the layers under
-            # this context. Here we use 32 channel convolution with shape 3x3 and
-            # PReLU as nonlinearity.
-            with argscope(Conv2D, kernel_shape=3, nl=PReLU.f, out_channel=32):
+            # this context. Here we use 32 channel convolution with shape 3x3
+            with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
                 """
                 LinearWrap is just a convenient way to compose a linear symbolic graph.
                 You can also do the equivalent in tensorflow style:
                 l = Conv2D('conv0', image)
-                l = MaxPooling('pool0', image, 2)
+                l = MaxPooling('pool0', l, 2)
                 ... """
                 logits = (LinearWrap(image)  # the starting brace is only for line-breaking
@@ -62,8 +79,8 @@ class Model(ModelDesc):
                           .FullyConnected('fc1', out_dim=10, nl=tf.identity)())
         prob = tf.nn.softmax(logits, name='prob')   # a Bx10 with probabilities

-        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)   # a vector of length B with loss of each sample
+        # a vector of length B with loss of each sample
+        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')   # the average cross-entropy loss

         # compute the "incorrect vector", for the callback ClassificationError to use at validation time
@@ -76,16 +93,23 @@ class Model(ModelDesc):
         train_error = tf.reduce_mean(wrong, name='train_error')
         summary.add_moving_summary(train_error)

+        if not USE_SLIM:
             # Use a regex to find parameters to apply weight decay.
             # Here we apply a weight decay on all W (weight matrix) of all fc layers
             wd_cost = tf.mul(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss), name='regularize_loss')
-            summary.add_moving_summary(cost, wd_cost)
+            self.cost = tf.add_n([wd_cost, cost], name='total_cost')
+            summary.add_moving_summary(cost, wd_cost, self.cost)
+        else:
+            # slim already adds regularization to a collection, no extra handling
+            self.cost = cost
+            summary.add_moving_summary(cost)

         # monitor histogram of all weight (of conv and fc layers) in tensorboard
-        summary.add_param_summary([('.*/W', ['histogram'])])
-        self.cost = tf.add_n([wd_cost, cost], name='cost')
+        summary.add_param_summary([('.*/W', ['histogram', 'rms']),
+                                   ('.*/weights', ['histogram', 'rms'])   # to also work with slim
+                                   ])

 def get_data():
@@ -122,7 +146,7 @@ def get_config():
             InferenceRunner(    # run inference(for validation) after every epoch
                 dataset_test,   # the DataFlow instance used for validation
                 # Calculate both the cost and the error for this DataFlow
-                [ScalarStats('cost'), ClassificationError('incorrect')]),
+                [ScalarStats('cross_entropy_loss'), ClassificationError('incorrect')]),
         ]),
         model=Model(),
         step_per_epoch=step_per_epoch,
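The docstring in the hunk above says LinearWrap is just a convenient way to compose a linear symbolic graph. For reference, a sketch of the equivalent explicit calls, using only the layers visible in this diff (the middle of the chain is elided here and is not part of the commit):

    l = Conv2D('conv0', image)
    l = MaxPooling('pool0', l, 2)
    # ... remaining conv/pool/fc layers elided ...
    logits = FullyConnected('fc1', l, out_dim=10, nl=tf.identity)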
tensorpack/models/batch_norm.py
@@ -112,6 +112,7 @@ def BatchNormV2(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
     Note:

     * In multi-tower training, only the first training tower maintains a moving average.
+      This is consistent with most frameworks.
     * It automatically selects :meth:`BatchNormV1` or :meth:`BatchNormV2`
       according to availability.
tensorpack/models/model_desc.py
@@ -11,6 +11,7 @@ import six
 from ..utils import logger, INPUT_VARS_KEY
 from ..tfutils.gradproc import CheckGradient
+from ..tfutils.summary import add_moving_summary
 from ..tfutils.tower import get_current_tower_context

 __all__ = ['ModelDesc', 'InputVar', 'ModelFromMetaGraph']
@@ -113,42 +114,32 @@ Use _build_graph(self, input_vars) and get_current_tower_context().is_training i
     def get_cost(self):
         """
         Return the cost tensor in the graph. Called by some of the :class:`tensorpack.train.Trainer` which
-        assumes single-cost models. Apply tfSlim modifications.
-        """
+        assumes single-cost models.

-        # current scope
-        scope = tf.get_variable_scope()
+        This function also apply tfslim collections to the cost automatically, including
+        ``tf.GraphKeys.REGULARIZATION_LOSSES`` and
+        ``tf.GraphKeys.UPDATE_OPS``. This is because slim users would expect
+        the regularizer being automatically applied once used in slim layers.
+        """
         # the model cost so far
         cost = self._get_cost()

-        # In contrast to this lib, when using tfSlim the user expect
-        # "with slim.arg_scope([...], weights_regularizer=slim.l2_regularizer(0.001)"
-        # to regularize these layers automatically. Note, this already contains the multiplier!
-        regulization_losses = 0
-        # try to prevent regEx error, iff scope name is empty ("")
-        try:
-            regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=scope))
-        except Exception:
-            regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-        # TODO: check if "scope=scope" should be used here too
+        regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
         if len(regulization_losses) > 0:
-            cost += tf.add_n(regulization_losses, name="regularize_loss")
+            reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
+            cost = tf.add(reg_loss, cost, name='total_cost')
+            add_moving_summary(reg_loss, cost)

-        # As these batch-norm statistics quickly accumulate, there is no significant loss of accuracy
-        # if only the main tower handles all batch-normalization updates, which are then shared across
-        # the towers
         ctx = get_current_tower_context()
         if ctx is not None and ctx.is_main_training_tower:
-            # if there is no entry in tf.GraphKeys.UPDATE_OPS, then there is a regEx exception
-            try:
-                non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=scope))
-            except Exception:
-                non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
+            non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
             if non_grad_updates:
                 with tf.control_dependencies(non_grad_updates):
-                    barrier = tf.control_flow_ops.no_op(name='batchnorm_barrier')
+                    barrier = tf.control_flow_ops.no_op(name='update_ops_barrier')
                 cost = tf.control_flow_ops.with_dependencies([barrier], cost)
         return cost
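The updated get_cost() docstring above says slim users expect layer regularizers to be applied automatically. A standalone sketch of the collection mechanism it now relies on — the placeholder and layer here are hypothetical, not part of this commit:

    import tensorflow as tf
    import tensorflow.contrib.slim as slim

    x = tf.placeholder(tf.float32, [None, 784], name='x')
    with slim.arg_scope([slim.layers.fully_connected],
                        weights_regularizer=slim.l2_regularizer(1e-5)):
        y = slim.layers.fully_connected(x, 10, activation_fn=None, scope='fc')

    # slim puts one L2 term per regularized layer into this collection;
    # get_cost() above sums them into 'regularize_loss', adds the result to the
    # model cost as 'total_cost', and registers both with add_moving_summary.
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    reg_loss = tf.add_n(reg_losses, name='regularize_loss')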
tensorpack/tfutils/summary.py
@@ -116,7 +116,7 @@ def summary_moving_average(tensors=None):
     :returns: a op to maintain these average.
     """
     if tensors is None:
-        tensors = tf.get_collection(MOVING_SUMMARY_VARS_KEY)
+        tensors = set(tf.get_collection(MOVING_SUMMARY_VARS_KEY))
     # TODO will produce tower0/xxx. not elegant
     with tf.name_scope(None):
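Wrapping the collection in set() deduplicates tensors that were registered more than once, presumably so only one moving-average op is created per tensor now that both models and other callers may add the same cost. A hypothetical illustration:

    add_moving_summary(cost)
    add_moving_summary(cost)    # accidentally registered twice
    tensors = set(tf.get_collection(MOVING_SUMMARY_VARS_KEY))    # contains `cost` only once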
tensorpack/train/feedfree.py
@@ -9,7 +9,7 @@ from ..utils import logger
 from ..tfutils import get_global_step_var
 from ..tfutils.tower import TowerContext
 from ..tfutils.gradproc import apply_grad_processors
-from ..tfutils.summary import summary_moving_average, add_moving_summary
+from ..tfutils.summary import summary_moving_average
 from .input_data import QueueInput, FeedfreeInput
 from .base import Trainer
@@ -51,7 +51,6 @@ class SingleCostFeedfreeTrainer(FeedfreeTrainerBase):
             cost_var,
             gate_gradients=tf.train.Optimizer.GATE_NONE,
             colocate_gradients_with_ops=False)
-        add_moving_summary(cost_var)
         return cost_var, grads

     def run_step(self):
tensorpack/train/trainer.py
@@ -9,7 +9,7 @@ from .base import Trainer
 from ..utils import SUMMARY_BACKUP_KEYS, PREDICT_TOWER
 from ..tfutils import (get_tensors_by_names, freeze_collection,
                        get_global_step_var, TowerContext)
-from ..tfutils.summary import summary_moving_average, add_moving_summary
+from ..tfutils.summary import summary_moving_average
 from ..predict import OnlinePredictor, build_multi_tower_prediction_graph
 from ..tfutils.gradproc import apply_grad_processors
 from .input_data import FeedInput
@@ -82,7 +82,6 @@ class SimpleTrainer(Trainer):
         with TowerContext('', is_training=True):
             model.build_graph(self.input_vars)
             cost_var = model.get_cost()
-        add_moving_summary(cost_var)
         grads = self.config.optimizer.compute_gradients(cost_var)
         grads = apply_grad_processors(grads,