Commit 1b73d9cc authored by Yuxin Wu

add mnist-slim example. Trainer does not automatically summarize the total cost any more

parent cb99d524
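The diff below applies the same change to every example: since the trainer no longer summarizes the total cost automatically, each model now passes its own `self.cost` to `add_moving_summary`. A minimal sketch of the new pattern, assuming the ModelDesc/InputVar API used by the examples in this commit (the single fc layer and the regularizer choice are illustrative, not taken from any one file):

```python
import tensorflow as tf
from tensorpack import ModelDesc, InputVar, FullyConnected, regularize_cost
from tensorpack.tfutils.summary import add_moving_summary


class ToyModel(ModelDesc):
    def _get_input_vars(self):
        return [InputVar(tf.float32, [None, 784], 'input'),
                InputVar(tf.int32, [None], 'label')]

    def _build_graph(self, input_vars):
        image, label = input_vars
        # a single fc layer keeps the sketch short; the real examples build deeper nets
        logits = FullyConnected('fc0', image, out_dim=10, nl=tf.identity)
        cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label),
            name='cross_entropy_loss')
        wd_cost = tf.mul(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                         name='regularize_loss')
        self.cost = tf.add_n([cost, wd_cost], name='cost')
        # before this commit the trainer added the total cost to the moving summary;
        # now the model has to list self.cost here explicitly
        add_moving_summary(cost, wd_cost, self.cost)
```

Models that relied on the trainer's implicit summary will simply stop logging the total cost unless they add it as above, which is exactly the change repeated across the example files below.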
@@ -136,6 +136,7 @@ class Model(ModelDesc):
tf.cast(BATCH_SIZE, tf.float32), name='cost')
summary.add_param_summary([('conv.*/W', ['histogram', 'rms']),
('fc.*/W', ['histogram', 'rms'])]) # monitor all W
add_moving_summary(self.cost)
def update_target_param(self):
vars = tf.trainable_variables()
@@ -51,9 +51,9 @@ class Model(mnist_example.Model):
cost = tf.reduce_mean(cost, name='cross_entropy_loss')
wd_cost = tf.mul(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
name='regularize_loss')
add_moving_summary(cost, wd_cost)
self.cost = tf.add_n([wd_cost, cost], name='cost')
add_moving_summary(cost, wd_cost, self.cost)
if __name__ == '__main__':
@@ -156,10 +156,10 @@ class Model(ModelDesc):
# weight decay on all W of fc layers
wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6))
add_moving_summary(cost, wd_cost)
add_param_summary([('.*/W', ['histogram', 'rms'])])
self.cost = tf.add_n([cost, wd_cost], name='cost')
add_moving_summary(cost, wd_cost, self.cost)
def get_data(dataset_name):
@@ -121,10 +121,10 @@ class Model(ModelDesc):
cost = tf.reduce_mean(cost, name='cross_entropy_loss')
# weight decay on all W of fc layers
wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))
add_moving_summary(cost, wd_cost)
add_param_summary([('.*/W', ['histogram', 'rms'])])
self.cost = tf.add_n([cost, wd_cost], name='cost')
add_moving_summary(cost, wd_cost, self.cost)
def get_config():
@@ -89,9 +89,9 @@ class Model(ModelDesc):
wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
costs.append(wd_cost)
add_moving_summary(costs + [wrong])
add_param_summary([('.*/W', ['histogram'])]) # monitor W
self.cost = tf.add_n(costs, name='cost')
add_moving_summary(costs + [wrong, self.cost])
def get_gradient_processor(self):
return [ScaleGradient([('convfcweight.*', 0.1), ('conv5_.*', 5)])]
@@ -114,10 +114,10 @@ class Model(ModelDesc):
wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),
80000, 0.7, True)
wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='l2_regularize_loss')
add_moving_summary(wd_cost)
add_param_summary([('.*/W', ['histogram'])]) # monitor W
self.cost = tf.add_n([cost, wd_cost], name='cost')
add_moving_summary(wd_cost, self.cost)
def get_data(train_or_test):
@@ -193,9 +193,9 @@ class Model(ModelDesc):
wd_w = tf.train.exponential_decay(0.00004, get_global_step_var(),
80000, 0.7, True)
wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='l2_regularize_loss')
add_moving_summary(loss1, loss2, wd_cost)
self.cost = tf.add_n([0.4 * loss1, loss2, wd_cost], name='cost')
add_moving_summary(loss1, loss2, wd_cost, self.cost)
def get_data(train_or_test):
@@ -122,13 +122,14 @@ class Model(ModelDesc):
pred_reward = tf.reduce_mean(self.value, name='predict_reward')
advantage = symbf.rms(advantage, name='rms_advantage')
summary.add_moving_summary(policy_loss, xentropy_loss, value_loss, pred_reward, advantage)
entropy_beta = tf.get_variable('entropy_beta', shape=[],
initializer=tf.constant_initializer(0.01), trainable=False)
self.cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
self.cost = tf.truediv(self.cost,
tf.cast(tf.shape(futurereward)[0], tf.float32),
name='cost')
summary.add_moving_summary(policy_loss, xentropy_loss,
value_loss, pred_reward, advantage, self.cost)
def get_gradient_processor(self):
return [MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
@@ -72,7 +72,7 @@ class Model(ModelDesc):
err = tf.edit_distance(predictions, label, normalize=True)
err.set_shape([None])
err = tf.reduce_mean(err, name='error')
summary.add_moving_summary(err)
summary.add_moving_summary(err, self.cost)
def get_gradient_processor(self):
return [GlobalNormClip(5), SummaryGradient()]
@@ -92,6 +92,7 @@ class Model(ModelDesc):
logits, symbolic_functions.flatten(nextinput))
self.cost = tf.reduce_mean(xent_loss, name='cost')
summary.add_param_summary([('.*/W', ['histogram'])]) # monitor histogram of all W
summary.add_moving_summary(self.cost)
def get_gradient_processor(self):
return [GlobalNormClip(5)]
@@ -5,6 +5,7 @@
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
import os
import sys
import argparse
@@ -18,6 +19,7 @@ about 0.6% validation error after 30 epochs.
from tensorpack import *
IMAGE_SIZE = 28
USE_SLIM = False
class Model(ModelDesc):
@@ -39,31 +41,46 @@ class Model(ModelDesc):
image = tf.expand_dims(image, 3)
image = image * 2 - 1 # center the pixels values at zero
# The context manager `argscope` sets the default option for all the layers under
# this context. Here we use 32 channel convolution with shape 3x3 and
# PReLU as nonlinearity.
with argscope(Conv2D, kernel_shape=3, nl=PReLU.f, out_channel=32):
"""
LinearWrap is just a convenient way to compose a linear symbolic graph.
You can also do the equivalent in tensorflow style:
l = Conv2D('conv0', image)
l = MaxPooling('pool0', image, 2)
... """
logits = (LinearWrap(image) # the starting brace is only for line-breaking
.Conv2D('conv0')
.MaxPooling('pool0', 2)
.Conv2D('conv1', padding='SAME')
.Conv2D('conv2')
.MaxPooling('pool1', 2)
.Conv2D('conv3')
.FullyConnected('fc0', 512, nl=tf.nn.relu)
.Dropout('dropout', 0.5)
.FullyConnected('fc1', out_dim=10, nl=tf.identity)())
if USE_SLIM:
is_training = get_current_tower_context().is_training
with slim.arg_scope([slim.layers.fully_connected],
weights_regularizer=slim.l2_regularizer(1e-5)):
l = slim.layers.conv2d(image, 32, [3, 3], scope='conv0')
l = slim.layers.max_pool2d(l, [2, 2], scope='pool0')
l = slim.layers.conv2d(l, 32, [3, 3], padding='SAME', scope='conv1')
l = slim.layers.conv2d(l, 32, [3, 3], scope='conv2')
l = slim.layers.max_pool2d(l, [2, 2], scope='pool1')
l = slim.layers.conv2d(l, 32, [3, 3], scope='conv3')
l = slim.layers.flatten(l, scope='flatten')
l = slim.layers.fully_connected(l, 512, scope='fc0')
l = slim.layers.dropout(l, is_training=is_training)
logits = slim.layers.fully_connected(l, 10, activation_fn=None, scope='fc1')
else:
# The context manager `argscope` sets the default option for all the layers under
# this context. Here we use 32 channel convolution with shape 3x3
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
"""
LinearWrap is just a convenient way to compose a linear symbolic graph.
You can also do the equivalent in tensorflow style:
l = Conv2D('conv0', image)
l = MaxPooling('pool0', l, 2)
... """
logits = (LinearWrap(image) # the starting brace is only for line-breaking
.Conv2D('conv0')
.MaxPooling('pool0', 2)
.Conv2D('conv1', padding='SAME')
.Conv2D('conv2')
.MaxPooling('pool1', 2)
.Conv2D('conv3')
.FullyConnected('fc0', 512, nl=tf.nn.relu)
.Dropout('dropout', 0.5)
.FullyConnected('fc1', out_dim=10, nl=tf.identity)())
prob = tf.nn.softmax(logits, name='prob') # a Bx10 with probabilities
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, label) # a vector of length B with loss of each sample
# a vector of length B with loss of each sample
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
cost = tf.reduce_mean(cost, name='cross_entropy_loss') # the average cross-entropy loss
# compute the "incorrect vector", for the callback ClassificationError to use at validation time
@@ -76,16 +93,23 @@ class Model(ModelDesc):
train_error = tf.reduce_mean(wrong, name='train_error')
summary.add_moving_summary(train_error)
# Use a regex to find parameters to apply weight decay.
# Here we apply a weight decay on all W (weight matrix) of all fc layers
wd_cost = tf.mul(1e-5,
regularize_cost('fc.*/W', tf.nn.l2_loss),
name='regularize_loss')
summary.add_moving_summary(cost, wd_cost)
if not USE_SLIM:
# Use a regex to find parameters to apply weight decay.
# Here we apply a weight decay on all W (weight matrix) of all fc layers
wd_cost = tf.mul(1e-5,
regularize_cost('fc.*/W', tf.nn.l2_loss),
name='regularize_loss')
self.cost = tf.add_n([wd_cost, cost], name='total_cost')
summary.add_moving_summary(cost, wd_cost, self.cost)
else:
# slim already adds regularization to a collection, no extra handling
self.cost = cost
summary.add_moving_summary(cost)
# monitor histogram of all weight (of conv and fc layers) in tensorboard
summary.add_param_summary([('.*/W', ['histogram'])])
self.cost = tf.add_n([wd_cost, cost], name='cost')
summary.add_param_summary([('.*/W', ['histogram', 'rms']),
('.*/weights', ['histogram', 'rms']) # to also work with slim
])
def get_data():
@@ -122,7 +146,7 @@ def get_config():
InferenceRunner( # run inference(for validation) after every epoch
dataset_test, # the DataFlow instance used for validation
# Calculate both the cost and the error for this DataFlow
[ScalarStats('cost'), ClassificationError('incorrect')]),
[ScalarStats('cross_entropy_loss'), ClassificationError('incorrect')]),
]),
model=Model(),
step_per_epoch=step_per_epoch,
@@ -112,6 +112,7 @@ def BatchNormV2(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
Note:
* In multi-tower training, only the first training tower maintains a moving average.
This is consistent with most frameworks.
* It automatically selects :meth:`BatchNormV1` or :meth:`BatchNormV2`
according to availability.
@@ -11,6 +11,7 @@ import six
from ..utils import logger, INPUT_VARS_KEY
from ..tfutils.gradproc import CheckGradient
from ..tfutils.summary import add_moving_summary
from ..tfutils.tower import get_current_tower_context
__all__ = ['ModelDesc', 'InputVar', 'ModelFromMetaGraph']
@@ -113,43 +114,33 @@ Use _build_graph(self, input_vars) and get_current_tower_context().is_training i
def get_cost(self):
"""
Return the cost tensor in the graph. Called by some of the :class:`tensorpack.train.Trainer` which
assumes single-cost models. Apply tfSlim modifications.
"""
assumes single-cost models.
# current scope
scope = tf.get_variable_scope()
This function also applies tfslim collections to the cost automatically, including
``tf.GraphKeys.REGULARIZATION_LOSSES`` and
``tf.GraphKeys.UPDATE_OPS``. This is because slim users expect
the regularizer to be applied automatically once it is used in slim layers.
"""
# the model cost so far
cost = self._get_cost()
# In contrast to this lib, when using tfSlim the user expect
# "with slim.arg_scope([...], weights_regularizer=slim.l2_regularizer(0.001)"
# to regularize these layers automatically. Note, this already contains the multiplier!
regulization_losses = 0
# try to prevent regEx error, iff scope name is empty ("")
try:
regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=scope))
except Exception:
regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
# TODO: check if "scope=scope" should be used here too
regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
if len(regulization_losses) > 0:
cost += tf.add_n(regulization_losses, name="regularize_loss")
reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
cost = tf.add(reg_loss, cost, name='total_cost')
add_moving_summary(reg_loss, cost)
# As these batch-norm statistics quickly accumulate, there is no significant loss of accuracy
# if only the main tower handles all batch-normalization updates, which are then shared across
# the towers
ctx = get_current_tower_context()
if ctx is not None and ctx.is_main_training_tower:
# if there is no entry in tf.GraphKeys.UPDATE_OPS, then there is a regEx exception
try:
non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=scope))
except Exception:
non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
if non_grad_updates:
with tf.control_dependencies(non_grad_updates):
barrier = tf.control_flow_ops.no_op(name='batchnorm_barrier')
cost = tf.control_flow_ops.with_dependencies([barrier], cost)
with tf.control_dependencies(non_grad_updates):
barrier = tf.control_flow_ops.no_op(name='update_ops_barrier')
cost = tf.control_flow_ops.with_dependencies([barrier], cost)
return cost
def _get_cost(self, *args):
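A hedged sketch of the slim interaction the new `get_cost` docstring describes: layers built under a slim `arg_scope` with a `weights_regularizer` put their regularization terms into `tf.GraphKeys.REGULARIZATION_LOSSES`, which `get_cost` now sums into `total_cost` and adds to the moving summary, so slim-based models (such as the USE_SLIM branch of the mnist example) do not add a `wd_cost` themselves. The snippet below only illustrates the collection mechanism and is not code from the commit:

```python
import tensorflow as tf
import tensorflow.contrib.slim as slim

x = tf.placeholder(tf.float32, [None, 784], name='x')
with slim.arg_scope([slim.layers.fully_connected],
                    weights_regularizer=slim.l2_regularizer(1e-5)):
    logits = slim.layers.fully_connected(x, 10, activation_fn=None, scope='fc')

# slim registered the l2 term for fc/weights in this collection; tensorpack's
# get_cost() reads it, adds it to the model cost as 'total_cost', and calls
# add_moving_summary on both.
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
print(len(reg_losses))  # -> 1
```

The same method also gathers `tf.GraphKeys.UPDATE_OPS` (e.g. slim batch-norm updates) and runs them behind the `update_ops_barrier` shown in the hunk above.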
@@ -116,7 +116,7 @@ def summary_moving_average(tensors=None):
:returns: a op to maintain these average.
"""
if tensors is None:
tensors = tf.get_collection(MOVING_SUMMARY_VARS_KEY)
tensors = set(tf.get_collection(MOVING_SUMMARY_VARS_KEY))
# TODO will produce tower0/xxx. not elegant
with tf.name_scope(None):
@@ -9,7 +9,7 @@ from ..utils import logger
from ..tfutils import get_global_step_var
from ..tfutils.tower import TowerContext
from ..tfutils.gradproc import apply_grad_processors
from ..tfutils.summary import summary_moving_average, add_moving_summary
from ..tfutils.summary import summary_moving_average
from .input_data import QueueInput, FeedfreeInput
from .base import Trainer
......@@ -51,7 +51,6 @@ class SingleCostFeedfreeTrainer(FeedfreeTrainerBase):
cost_var,
gate_gradients=tf.train.Optimizer.GATE_NONE,
colocate_gradients_with_ops=False)
add_moving_summary(cost_var)
return cost_var, grads
def run_step(self):
@@ -9,7 +9,7 @@ from .base import Trainer
from ..utils import SUMMARY_BACKUP_KEYS, PREDICT_TOWER
from ..tfutils import (get_tensors_by_names, freeze_collection,
get_global_step_var, TowerContext)
from ..tfutils.summary import summary_moving_average, add_moving_summary
from ..tfutils.summary import summary_moving_average
from ..predict import OnlinePredictor, build_multi_tower_prediction_graph
from ..tfutils.gradproc import apply_grad_processors
from .input_data import FeedInput
......@@ -82,7 +82,6 @@ class SimpleTrainer(Trainer):
with TowerContext('', is_training=True):
model.build_graph(self.input_vars)
cost_var = model.get_cost()
add_moving_summary(cost_var)
grads = self.config.optimizer.compute_gradients(cost_var)
grads = apply_grad_processors(grads,