Commit 1b73d9cc authored by Yuxin Wu

add mnist-slim example. Trainer no longer automatically summarizes the total cost

parent cb99d524
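
What this change means for downstream models, shown as a minimal sketch rather than code from this repository (the class name MyModel and the single FC layer are made up for illustration; ModelDesc, InputVar, FullyConnected, regularize_cost and add_moving_summary are the tensorpack names used in the diffs below): since the trainer no longer summarizes the total cost on its own, each model now registers self.cost with the moving summary itself.

    # hedged sketch, not part of this commit: a hypothetical user model after the change
    import tensorflow as tf
    from tensorpack import ModelDesc, InputVar, FullyConnected, regularize_cost
    from tensorpack.tfutils.summary import add_moving_summary

    class MyModel(ModelDesc):
        def _get_input_vars(self):
            return [InputVar(tf.float32, (None, 784), 'input'),
                    InputVar(tf.int32, (None,), 'label')]

        def _build_graph(self, input_vars):
            image, label = input_vars
            logits = FullyConnected('fc0', image, out_dim=10, nl=tf.identity)
            cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
            cost = tf.reduce_mean(cost, name='cross_entropy_loss')
            wd_cost = tf.mul(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                             name='regularize_loss')
            self.cost = tf.add_n([cost, wd_cost], name='cost')
            # after this commit, the model itself must summarize the total cost:
            add_moving_summary(cost, wd_cost, self.cost)
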
@@ -136,6 +136,7 @@ class Model(ModelDesc):
                               tf.cast(BATCH_SIZE, tf.float32), name='cost')
         summary.add_param_summary([('conv.*/W', ['histogram', 'rms']),
                                    ('fc.*/W', ['histogram', 'rms'])])   # monitor all W
+        add_moving_summary(self.cost)

     def update_target_param(self):
         vars = tf.trainable_variables()
...
@@ -51,9 +51,9 @@ class Model(mnist_example.Model):
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')
         wd_cost = tf.mul(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
                          name='regularize_loss')
-        add_moving_summary(cost, wd_cost)
         self.cost = tf.add_n([wd_cost, cost], name='cost')
+        add_moving_summary(cost, wd_cost, self.cost)

 if __name__ == '__main__':
...
@@ -156,10 +156,10 @@ class Model(ModelDesc):
         # weight decay on all W of fc layers
         wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6))
-        add_moving_summary(cost, wd_cost)
         add_param_summary([('.*/W', ['histogram', 'rms'])])
         self.cost = tf.add_n([cost, wd_cost], name='cost')
+        add_moving_summary(cost, wd_cost, self.cost)

 def get_data(dataset_name):
...
@@ -121,10 +121,10 @@ class Model(ModelDesc):
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')
         # weight decay on all W of fc layers
         wd_cost = regularize_cost('fc.*/W', l2_regularizer(1e-7))
-        add_moving_summary(cost, wd_cost)
         add_param_summary([('.*/W', ['histogram', 'rms'])])
         self.cost = tf.add_n([cost, wd_cost], name='cost')
+        add_moving_summary(cost, wd_cost, self.cost)

 def get_config():
...
@@ -89,9 +89,9 @@ class Model(ModelDesc):
         wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost')
         costs.append(wd_cost)
-        add_moving_summary(costs + [wrong])
         add_param_summary([('.*/W', ['histogram'])])   # monitor W
         self.cost = tf.add_n(costs, name='cost')
+        add_moving_summary(costs + [wrong, self.cost])

     def get_gradient_processor(self):
         return [ScaleGradient([('convfcweight.*', 0.1), ('conv5_.*', 5)])]
...
@@ -114,10 +114,10 @@ class Model(ModelDesc):
         wd_w = tf.train.exponential_decay(0.0002, get_global_step_var(),
                                           80000, 0.7, True)
         wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='l2_regularize_loss')
-        add_moving_summary(wd_cost)
         add_param_summary([('.*/W', ['histogram'])])   # monitor W
         self.cost = tf.add_n([cost, wd_cost], name='cost')
+        add_moving_summary(wd_cost, self.cost)

 def get_data(train_or_test):
...
@@ -193,9 +193,9 @@ class Model(ModelDesc):
         wd_w = tf.train.exponential_decay(0.00004, get_global_step_var(),
                                           80000, 0.7, True)
         wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='l2_regularize_loss')
-        add_moving_summary(loss1, loss2, wd_cost)
         self.cost = tf.add_n([0.4 * loss1, loss2, wd_cost], name='cost')
+        add_moving_summary(loss1, loss2, wd_cost, self.cost)

 def get_data(train_or_test):
...
@@ -122,13 +122,14 @@ class Model(ModelDesc):
         pred_reward = tf.reduce_mean(self.value, name='predict_reward')
         advantage = symbf.rms(advantage, name='rms_advantage')
-        summary.add_moving_summary(policy_loss, xentropy_loss, value_loss, pred_reward, advantage)
         entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                        initializer=tf.constant_initializer(0.01), trainable=False)
         self.cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
         self.cost = tf.truediv(self.cost,
                                tf.cast(tf.shape(futurereward)[0], tf.float32),
                                name='cost')
+        summary.add_moving_summary(policy_loss, xentropy_loss,
+                                   value_loss, pred_reward, advantage, self.cost)

     def get_gradient_processor(self):
         return [MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
...
@@ -72,7 +72,7 @@ class Model(ModelDesc):
         err = tf.edit_distance(predictions, label, normalize=True)
         err.set_shape([None])
         err = tf.reduce_mean(err, name='error')
-        summary.add_moving_summary(err)
+        summary.add_moving_summary(err, self.cost)

     def get_gradient_processor(self):
         return [GlobalNormClip(5), SummaryGradient()]
...
@@ -92,6 +92,7 @@ class Model(ModelDesc):
             logits, symbolic_functions.flatten(nextinput))
         self.cost = tf.reduce_mean(xent_loss, name='cost')
         summary.add_param_summary([('.*/W', ['histogram'])])   # monitor histogram of all W
+        summary.add_moving_summary(self.cost)

     def get_gradient_processor(self):
         return [GlobalNormClip(5)]
...
@@ -5,6 +5,7 @@
 import numpy as np
 import tensorflow as tf
+import tensorflow.contrib.slim as slim
 import os
 import sys
 import argparse
@@ -18,6 +19,7 @@ about 0.6% validation error after 30 epochs.
 from tensorpack import *

 IMAGE_SIZE = 28
+USE_SLIM = False

 class Model(ModelDesc):
@@ -39,31 +41,46 @@ class Model(ModelDesc):
         image = tf.expand_dims(image, 3)
         image = image * 2 - 1   # center the pixels values at zero

-        # The context manager `argscope` sets the default option for all the layers under
-        # this context. Here we use 32 channel convolution with shape 3x3 and
-        # PReLU as nonlinearity.
-        with argscope(Conv2D, kernel_shape=3, nl=PReLU.f, out_channel=32):
-            """
-            LinearWrap is just a convenient way to compose a linear symbolic graph.
-            You can also do the equivalent in tensorflow style:
-            l = Conv2D('conv0', image)
-            l = MaxPooling('pool0', image, 2)
-            ...  """
-
-            logits = (LinearWrap(image)  # the starting brace is only for line-breaking
-                      .Conv2D('conv0')
-                      .MaxPooling('pool0', 2)
-                      .Conv2D('conv1', padding='SAME')
-                      .Conv2D('conv2')
-                      .MaxPooling('pool1', 2)
-                      .Conv2D('conv3')
-                      .FullyConnected('fc0', 512, nl=tf.nn.relu)
-                      .Dropout('dropout', 0.5)
-                      .FullyConnected('fc1', out_dim=10, nl=tf.identity)())
+        if USE_SLIM:
+            is_training = get_current_tower_context().is_training
+            with slim.arg_scope([slim.layers.fully_connected],
+                                weights_regularizer=slim.l2_regularizer(1e-5)):
+                l = slim.layers.conv2d(image, 32, [3, 3], scope='conv0')
+                l = slim.layers.max_pool2d(l, [2, 2], scope='pool0')
+                l = slim.layers.conv2d(l, 32, [3, 3], padding='SAME', scope='conv1')
+                l = slim.layers.conv2d(l, 32, [3, 3], scope='conv2')
+                l = slim.layers.max_pool2d(l, [2, 2], scope='pool1')
+                l = slim.layers.conv2d(l, 32, [3, 3], scope='conv3')
+                l = slim.layers.flatten(l, scope='flatten')
+                l = slim.layers.fully_connected(l, 512, scope='fc0')
+                l = slim.layers.dropout(l, is_training=is_training)
+                logits = slim.layers.fully_connected(l, 10, activation_fn=None, scope='fc1')
+        else:
+            # The context manager `argscope` sets the default option for all the layers under
+            # this context. Here we use 32 channel convolution with shape 3x3
+            with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
+                """
+                LinearWrap is just a convenient way to compose a linear symbolic graph.
+                You can also do the equivalent in tensorflow style:
+                l = Conv2D('conv0', image)
+                l = MaxPooling('pool0', l, 2)
+                ...  """
+
+                logits = (LinearWrap(image)  # the starting brace is only for line-breaking
+                          .Conv2D('conv0')
+                          .MaxPooling('pool0', 2)
+                          .Conv2D('conv1', padding='SAME')
+                          .Conv2D('conv2')
+                          .MaxPooling('pool1', 2)
+                          .Conv2D('conv3')
+                          .FullyConnected('fc0', 512, nl=tf.nn.relu)
+                          .Dropout('dropout', 0.5)
+                          .FullyConnected('fc1', out_dim=10, nl=tf.identity)())
         prob = tf.nn.softmax(logits, name='prob')   # a Bx10 with probabilities

-        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(
-            logits, label)  # a vector of length B with loss of each sample
+        # a vector of length B with loss of each sample
+        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')  # the average cross-entropy loss

         # compute the "incorrect vector", for the callback ClassificationError to use at validation time
@@ -76,16 +93,23 @@ class Model(ModelDesc):
         train_error = tf.reduce_mean(wrong, name='train_error')
         summary.add_moving_summary(train_error)

-        # Use a regex to find parameters to apply weight decay.
-        # Here we apply a weight decay on all W (weight matrix) of all fc layers
-        wd_cost = tf.mul(1e-5,
-                         regularize_cost('fc.*/W', tf.nn.l2_loss),
-                         name='regularize_loss')
-        summary.add_moving_summary(cost, wd_cost)
+        if not USE_SLIM:
+            # Use a regex to find parameters to apply weight decay.
+            # Here we apply a weight decay on all W (weight matrix) of all fc layers
+            wd_cost = tf.mul(1e-5,
+                             regularize_cost('fc.*/W', tf.nn.l2_loss),
+                             name='regularize_loss')
+            self.cost = tf.add_n([wd_cost, cost], name='total_cost')
+            summary.add_moving_summary(cost, wd_cost, self.cost)
+        else:
+            # slim already adds regularization to a collection, no extra handling
+            self.cost = cost
+            summary.add_moving_summary(cost)

         # monitor histogram of all weight (of conv and fc layers) in tensorboard
-        summary.add_param_summary([('.*/W', ['histogram'])])
-        self.cost = tf.add_n([wd_cost, cost], name='cost')
+        summary.add_param_summary([('.*/W', ['histogram', 'rms']),
+                                   ('.*/weights', ['histogram', 'rms'])   # to also work with slim
+                                   ])

 def get_data():
@@ -122,7 +146,7 @@ def get_config():
             InferenceRunner(    # run inference(for validation) after every epoch
                 dataset_test,   # the DataFlow instance used for validation
                 # Calculate both the cost and the error for this DataFlow
-                [ScalarStats('cost'), ClassificationError('incorrect')]),
+                [ScalarStats('cross_entropy_loss'), ClassificationError('incorrect')]),
         ]),
         model=Model(),
         step_per_epoch=step_per_epoch,
...
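
The slim branch above drops the explicit regularize_cost call because slim registers the weight decay itself. A hedged, standalone sketch of the mechanism the example relies on (plain tf.contrib.slim, not tensorpack code): any slim layer built under an arg_scope with weights_regularizer leaves a loss tensor in tf.GraphKeys.REGULARIZATION_LOSSES, which the reworked ModelDesc.get_cost() below then sums into the total cost.

    # hedged sketch: how slim's regularizer ends up in a graph collection
    import tensorflow as tf
    import tensorflow.contrib.slim as slim

    x = tf.placeholder(tf.float32, [None, 784])
    with slim.arg_scope([slim.layers.fully_connected],
                        weights_regularizer=slim.l2_regularizer(1e-5)):
        y = slim.layers.fully_connected(x, 10, activation_fn=None, scope='fc')

    # each regularized layer contributes one tensor to this collection
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    print(len(reg_losses))                              # -> 1
    reg_loss = tf.add_n(reg_losses, name='regularize_loss')
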
@@ -112,6 +112,7 @@ def BatchNormV2(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
     Note:

     * In multi-tower training, only the first training tower maintains a moving average.
+      This is consistent with most frameworks.
     * It automatically selects :meth:`BatchNormV1` or :meth:`BatchNormV2`
       according to availability.
...
@@ -11,6 +11,7 @@ import six
 from ..utils import logger, INPUT_VARS_KEY
 from ..tfutils.gradproc import CheckGradient
+from ..tfutils.summary import add_moving_summary
 from ..tfutils.tower import get_current_tower_context

 __all__ = ['ModelDesc', 'InputVar', 'ModelFromMetaGraph']
@@ -113,43 +114,33 @@ Use _build_graph(self, input_vars) and get_current_tower_context().is_training i
     def get_cost(self):
         """
         Return the cost tensor in the graph. Called by some of the :class:`tensorpack.train.Trainer` which
-        assumes single-cost models. Apply tfSlim modifications.
-        """
-        # current scope
-        scope = tf.get_variable_scope()
+        assumes single-cost models.
+
+        This function also applies tfslim collections to the cost automatically, including
+        ``tf.GraphKeys.REGULARIZATION_LOSSES`` and
+        ``tf.GraphKeys.UPDATE_OPS``, because slim users expect the regularizer
+        to be applied automatically once it is used in slim layers.
+        """

         # the model cost so far
         cost = self._get_cost()

-        # In contrast to this lib, when using tfSlim the user expect
-        # "with slim.arg_scope([...], weights_regularizer=slim.l2_regularizer(0.001)"
-        # to regularize these layers automatically. Note, this already contains the multiplier!
-        regulization_losses = 0
-        # try to prevent regEx error, iff scope name is empty ("")
-        try:
-            regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=scope))
-        except Exception:
-            regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-        # TODO: check if "scope=scope" should be used here too
+        regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
         if len(regulization_losses) > 0:
-            cost += tf.add_n(regulization_losses, name="regularize_loss")
+            reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
+            cost = tf.add(reg_loss, cost, name='total_cost')
+            add_moving_summary(reg_loss, cost)

         # As these batch-norm statistics quickly accumulate, there is no significant loss of accuracy
         # if only the main tower handles all batch-normalization updates, which are then shared across
         # the towers
         ctx = get_current_tower_context()
         if ctx is not None and ctx.is_main_training_tower:
-            # if there is no entry in tf.GraphKeys.UPDATE_OPS, then there is a regEx exception
-            try:
-                non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=scope))
-            except Exception:
-                non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
+            non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
             if non_grad_updates:
                 with tf.control_dependencies(non_grad_updates):
-                    barrier = tf.control_flow_ops.no_op(name='batchnorm_barrier')
+                    barrier = tf.control_flow_ops.no_op(name='update_ops_barrier')
                     cost = tf.control_flow_ops.with_dependencies([barrier], cost)
         return cost

     def _get_cost(self, *args):
...
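
The UPDATE_OPS handling in get_cost() follows the usual TensorFlow batch-norm pattern. A hedged, standalone sketch of the same idea in plain TF (tf.contrib.layers.batch_norm instead of tensorpack's BatchNorm, tf.identity instead of the no_op barrier): ops registered in tf.GraphKeys.UPDATE_OPS become control dependencies of the cost, so fetching the cost also runs them.

    # hedged sketch: force UPDATE_OPS (e.g. batch-norm moving averages) to run with the cost
    import tensorflow as tf

    x = tf.placeholder(tf.float32, [None, 16])
    # contrib batch_norm registers its moving-average updates in UPDATE_OPS by default
    y = tf.contrib.layers.batch_norm(x, is_training=True)
    cost = tf.reduce_mean(y, name='cost')

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        with tf.control_dependencies(update_ops):
            # evaluating this tensor now also triggers the update ops
            cost = tf.identity(cost, name='total_cost_with_updates')
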
@@ -116,7 +116,7 @@ def summary_moving_average(tensors=None):
     :returns: a op to maintain these average.
     """
     if tensors is None:
-        tensors = tf.get_collection(MOVING_SUMMARY_VARS_KEY)
+        tensors = set(tf.get_collection(MOVING_SUMMARY_VARS_KEY))

     # TODO will produce tower0/xxx. not elegant
     with tf.name_scope(None):
...
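
A plausible reading of the set() change above (inferred, not stated in the commit): with this commit the same tensor can be registered for a moving summary more than once, e.g. once by the model and once by ModelDesc.get_cost(), and deduplicating avoids building two averaging ops for it. A hedged sketch (the collection key's string value is assumed for illustration):

    # hedged sketch: duplicate registrations collapse to one entry
    import tensorflow as tf

    MOVING_SUMMARY_VARS_KEY = 'MOVING_SUMMARY_VARIABLES'   # value assumed, see tensorpack's naming module
    cost = tf.constant(1.0, name='cost')
    tf.add_to_collection(MOVING_SUMMARY_VARS_KEY, cost)
    tf.add_to_collection(MOVING_SUMMARY_VARS_KEY, cost)    # registered again elsewhere

    tensors = set(tf.get_collection(MOVING_SUMMARY_VARS_KEY))
    print(len(tensors))   # -> 1, only one moving average is created for 'cost'
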
@@ -9,7 +9,7 @@ from ..utils import logger
 from ..tfutils import get_global_step_var
 from ..tfutils.tower import TowerContext
 from ..tfutils.gradproc import apply_grad_processors
-from ..tfutils.summary import summary_moving_average, add_moving_summary
+from ..tfutils.summary import summary_moving_average
 from .input_data import QueueInput, FeedfreeInput
 from .base import Trainer
@@ -51,7 +51,6 @@ class SingleCostFeedfreeTrainer(FeedfreeTrainerBase):
             cost_var,
             gate_gradients=tf.train.Optimizer.GATE_NONE,
             colocate_gradients_with_ops=False)
-        add_moving_summary(cost_var)
         return cost_var, grads

     def run_step(self):
...
@@ -9,7 +9,7 @@ from .base import Trainer
 from ..utils import SUMMARY_BACKUP_KEYS, PREDICT_TOWER
 from ..tfutils import (get_tensors_by_names, freeze_collection,
                        get_global_step_var, TowerContext)
-from ..tfutils.summary import summary_moving_average, add_moving_summary
+from ..tfutils.summary import summary_moving_average
 from ..predict import OnlinePredictor, build_multi_tower_prediction_graph
 from ..tfutils.gradproc import apply_grad_processors
 from .input_data import FeedInput
@@ -82,7 +82,6 @@ class SimpleTrainer(Trainer):
         with TowerContext('', is_training=True):
             model.build_graph(self.input_vars)
             cost_var = model.get_cost()
-            add_moving_summary(cost_var)
         grads = self.config.optimizer.compute_gradients(cost_var)
         grads = apply_grad_processors(grads,
...