Commit 076a728f authored by Yuxin Wu

some docs update & add optimizer with gradproc

parent 3f91978c
......@@ -19,6 +19,7 @@ Also note that multi-GPU doesn't give you obvious speedup here,
because the bottleneck in this implementation is not computation but data.
Some practical notes:
1. On machines without huge memory, enabling tcmalloc may keep training throughput more stable.
2. Occasionally, processes may not get terminated completely. It is suggested to use `systemd-run` to run any
multiprocess Python program to get a cgroup dedicated for the task.
......
......@@ -133,8 +133,8 @@ class Model(ModelDesc):
value_loss, pred_reward, advantage, self.cost)
def get_gradient_processor(self):
return [MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
SummaryGradient()]
return [gradproc.MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
gradproc.SummaryGradient()]
class MySimulatorMaster(SimulatorMaster, Callback):
......
......@@ -74,7 +74,7 @@ class Model(ModelDesc):
summary.add_moving_summary(err, self.cost)
def get_gradient_processor(self):
return [GlobalNormClip(5), SummaryGradient()]
return [gradproc.GlobalNormClip(5), gradproc.SummaryGradient()]
def get_data(path, isTrain, stat_file):
......
......@@ -106,7 +106,7 @@ class Model(ModelDesc):
summary.add_moving_summary(self.cost)
def get_gradient_processor(self):
return [GlobalNormClip(5)]
return [gradproc.GlobalNormClip(5)]
def get_config():
......
......@@ -150,8 +150,8 @@ class Model(ModelDesc):
return tf.group(*ops, name='update_target_network')
def get_gradient_processor(self):
return [MapGradient(lambda grad: tf.clip_by_global_norm([grad], 5)[0][0]),
SummaryGradient()]
return [gradproc.GlobalNormClip(10),
gradproc.SummaryGradient()]
def get_config():
......
......@@ -96,17 +96,13 @@ class Model(GANModelDesc):
fake_sample_viz = tf.cast((fake_sample + 1) * 128.0, tf.uint8, name='viz')
tf.summary.image('gen', fake_sample_viz, max_outputs=30)
# TODO investigate how bn stats should be updated across two discrim
# may need to investigate how bn stats should be updated across two discrim
with tf.variable_scope('discrim'):
real_pred, _ = self.discriminator(real_sample)
with tf.variable_scope('discrim', reuse=True):
fake_pred, dist_param = self.discriminator(fake_sample)
# post-process output vector from discriminator to become valid
# distribution parameters
encoder_activation = self.factors.encoder_activation(dist_param)
"""
Mutual information between x (i.e. zc in this case) and some
information s (the generated samples in this case):
......@@ -130,6 +126,8 @@ class Model(GANModelDesc):
# Adding this term may make the curve less stable because the
# entropy estimated from the samples is not the true value.
# post-process output vector from discriminator to obtain valid distribution parameters
encoder_activation = self.factors.encoder_activation(dist_param)
cond_ents = self.factors.entropy(zc, encoder_activation)
cond_entropy = tf.add_n(cond_ents, name="total_conditional_entropy")
......@@ -139,7 +137,7 @@ class Model(GANModelDesc):
# default GAN objective
self.build_losses(real_pred, fake_pred)
# subtract mutual information for latent factores (we want to maximize them)
# subtract mutual information for latent factors (we want to maximize them)
self.g_loss = tf.subtract(self.g_loss, MI, name='total_g_loss')
self.d_loss = tf.subtract(self.d_loss, MI, name='total_d_loss')
......@@ -150,7 +148,7 @@ class Model(GANModelDesc):
def get_gradient_processor_g(self):
# generator learns 5 times faster
return [CheckGradient(), ScaleGradient(('.*', 5), log=False)]
return [gradproc.ScaleGradient(('.*', 5), log=False)]
def get_data():
......
......@@ -93,7 +93,8 @@ class Model(ModelDesc):
add_moving_summary(costs + [wrong, self.cost])
def get_gradient_processor(self):
return [ScaleGradient([('convfcweight.*', 0.1), ('conv5_.*', 5)])]
return [gradproc.ScaleGradient([
('convfcweight.*', 0.1), ('conv5_.*', 5)])]
def get_data(name):
......
......@@ -101,7 +101,7 @@ class Model(ModelDesc):
s[1].h.assign(z))
def get_gradient_processor(self):
return [GlobalNormClip(5)]
return [gradproc.GlobalNormClip(5)]
def get_config():
......
......@@ -12,6 +12,10 @@ Models can be [downloaded here](https://goo.gl/6XjK9V).
| ResNet 50 | 7.13% | 24.12% |
| ResNet 101 | 6.54% | 22.89% |
```bash
./imagenet-resnet.py --data /path/to/ILSVRC --gpu 0,1,2,3 -d 18
```
![imagenet](imagenet-resnet.png)
## load-resnet.py
......
......@@ -25,6 +25,9 @@ n=5, about 7.1% val error after 67k steps (8.6 step/s)
n=18, about 5.95% val error after 80k steps (2.6 step/s)
n=30: a 182-layer network, about 5.6% val error after 51k steps (1.55 step/s)
This model uses the whole training set instead of a train-val split.
To train:
./cifar10-resnet.py --gpu 0,1
"""
BATCH_SIZE = 128
......
......@@ -17,11 +17,6 @@ from tensorpack.utils.stats import RatioCounter
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.tfutils.summary import *
"""
Training code of Pre-Activation version of ResNet on ImageNet.
It mainly follows the setup in fb.resnet.torch, and get similar performance.
"""
TOTAL_BATCH_SIZE = 256
INPUT_SHAPE = 224
DEPTH = None
......
......@@ -86,8 +86,8 @@ class Model(ModelDesc):
self.cost = tf.add_n([wd_cost, cost], name='cost')
def get_gradient_processor(self):
return [MapGradient(lambda grad: tf.clip_by_global_norm([grad], 5)[0][0]),
ScaleGradient(('STN.*', 0.1)), SummaryGradient()]
return [gradproc.ScaleGradient(('STN.*', 0.1)),
gradproc.SummaryGradient()]
def get_data(isTrain):
......
......@@ -24,8 +24,10 @@ USE_SLIM = False
class Model(ModelDesc):
def _get_inputs(self):
"""Define all the input variables (with type, shape, name) that'll be
fed into the graph to produce a cost. """
"""
Define all the inputs (with type, shape, name) that
the graph will need.
"""
return [InputDesc(tf.float32, (None, IMAGE_SIZE, IMAGE_SIZE), 'input'),
InputDesc(tf.int32, (None,), 'label')]
......
......@@ -9,9 +9,9 @@ import pickle
import six
from ..utils import logger, INPUTS_KEY
from ..utils.argtools import memoized
from ..tfutils.modelutils import apply_slim_collections
from ..tfutils.gradproc import CheckGradient
from ..tfutils.summary import add_moving_summary
from ..tfutils.tower import get_current_tower_context
__all__ = ['InputDesc', 'InputVar', 'ModelDesc', 'ModelFromMetaGraph']
......@@ -41,8 +41,10 @@ class InputDesc(object):
return pickle.loads(buf)
# TODO print warning?
InputVar = InputDesc
class InputVar(InputDesc):
def __init__(self, *args, **kwargs):
logger.warn("[Deprecated] InputVar was renamed to InputDesc!")
super(InputVar, self).__init__(*args, **kwargs)
@six.add_metaclass(ABCMeta)
......@@ -50,6 +52,7 @@ class ModelDesc(object):
""" Base class for a model description """
# inputs:
@memoized
def get_reused_placehdrs(self):
"""
Create or return (if already created) raw input TF placeholders in the graph.
......@@ -57,11 +60,7 @@ class ModelDesc(object):
Returns:
list[tf.Tensor]: the list of input placeholders in the graph.
"""
if hasattr(self, 'reuse_input_vars'):
return self.reuse_input_vars
ret = self.build_placeholders()
self.reuse_input_vars = ret
return ret
return self.build_placeholders()
def get_input_vars(self):
# this wasn't a public API anyway
......@@ -70,7 +69,7 @@ class ModelDesc(object):
def build_placeholders(self, prefix=''):
"""
For each input, create new placeholders with optional prefix and
For each InputDesc, create new placeholders with optional prefix and
return them. Useful when building new towers.
Returns:
......@@ -105,8 +104,6 @@ class ModelDesc(object):
def _get_input_vars(self): # keep backward compatibility
raise NotImplementedError()
# build graph:
def build_graph(self, model_inputs):
"""
Build the whole symbolic graph.
......@@ -121,46 +118,35 @@ class ModelDesc(object):
def _build_graph(self, inputs):
pass
# set cost. Only for single-cost model.
def get_cost(self):
"""
Return the cost tensor in the graph. Called by some of the :class:`tensorpack.train.Trainer` which
assumes single-cost models.
Return the cost tensor in the graph.
Used by some of the tensorpack :class:`Trainer` which assumes single-cost models.
You can ignore this method if you use your own trainer with more than one cost.
This function also apply tfslim collections to the cost automatically, including
``tf.GraphKeys.REGULARIZATION_LOSSES`` and
``tf.GraphKeys.UPDATE_OPS``. This is because slim users would expect
the regularizer being automatically applied once used in slim layers.
"""
It calls :meth:`ModelDesc._get_cost()` which by default returns
``self.cost``. You can override :meth:`_get_cost()` if needed.
# the model cost so far
This function also applies tfslim collections to the cost automatically,
including ``tf.GraphKeys.REGULARIZATION_LOSSES`` and ``tf.GraphKeys.UPDATE_OPS``.
This is because slim users would expect the regularizer to be applied automatically once slim layers are used.
"""
cost = self._get_cost()
regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
if len(regulization_losses) > 0:
reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
cost = tf.add(reg_loss, cost, name='total_cost')
add_moving_summary(reg_loss, cost)
# As these batch-norm statistics quickly accumulate, there is no significant loss of accuracy
# if only the main tower handles all batch-normalization updates, which are then shared across
# the towers
ctx = get_current_tower_context()
if ctx is not None and ctx.is_main_training_tower:
non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
if non_grad_updates:
logger.info("Apply UPDATE_OPS collection on cost.")
with tf.control_dependencies(non_grad_updates):
cost = tf.identity(cost)
return cost
return apply_slim_collections(cost)
def _get_cost(self, *args):
return self.cost
# set optimizer. only for single-optimizer model.
@memoized
def get_optimizer(self):
"""
Return the optimizer used in the task.
Used by some of the tensorpack :class:`Trainer` which only uses a single optimizer.
You can ignore this method if you use your own trainer with more than one optimizer.
Users of :class:`ModelDesc` will need to implement `_get_optimizer()`,
which will only be called once per model.
Returns:
a :class:`tf.train.Optimizer` instance.
"""
......@@ -170,7 +156,7 @@ class ModelDesc(object):
raise NotImplementedError()
def get_gradient_processor(self):
""" Return a list of :class:`tensorpack.tfutils.GradientProcessor`.
""" (Deprecated) Return a list of :class:`tensorpack.tfutils.GradientProcessor`.
They will be executed by the trainer in the given order.
"""
return [ # SummaryGradient(),
......
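For context, a minimal sketch of the single-cost / single-optimizer contract that the docstrings above describe (the class, layer sizes and optimizer below are illustrative, not part of this commit):

```python
import tensorflow as tf
from tensorpack import ModelDesc, InputDesc  # assuming both are exported at the top level

class ToyModel(ModelDesc):
    def _get_inputs(self):
        return [InputDesc(tf.float32, (None, 10), 'feat'),
                InputDesc(tf.int32, (None,), 'label')]

    def _build_graph(self, inputs):
        feat, label = inputs
        logits = tf.layers.dense(feat, 2)
        # Setting self.cost is enough: _get_cost() returns it, and get_cost()
        # then applies the slim collections (regularizers, UPDATE_OPS).
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=label), name='cost')

    def _get_optimizer(self):
        # get_optimizer() is memoized, so this runs only once per model.
        return tf.train.AdamOptimizer(1e-3)
```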
......@@ -19,7 +19,6 @@ def _global_import(name):
_TO_IMPORT = set([
'common',
'sessinit',
'gradproc',
'argscope',
'tower'
])
......
......@@ -20,7 +20,7 @@ def apply_grad_processors(grads, gradprocs):
"""
Args:
grads (list): list of (grad, var).
gradprocs (list): list of :class:`GradientProcessor` instances.
gradprocs (list[GradientProcessor]): gradient processors to apply.
Returns:
list: list of (grad, var) that have gone through the processors.
"""
......
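A small, self-contained usage sketch of this function (the variable and cost below are made up for illustration):

```python
import tensorflow as tf
from tensorpack.tfutils import gradproc

w = tf.get_variable('w', shape=[3], initializer=tf.zeros_initializer())
cost = tf.reduce_sum(tf.square(w - 1.0))

opt = tf.train.GradientDescentOptimizer(0.1)
grads = opt.compute_gradients(cost)          # list of (grad, var)
# Clip by global norm, then add gradient histograms to the summaries.
grads = gradproc.apply_grad_processors(
    grads, [gradproc.GlobalNormClip(5), gradproc.SummaryGradient()])
train_op = opt.apply_gradients(grads)
```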
# -*- coding: UTF-8 -*-
# File: modelutils.py
# Author: Yuxin Wu <ppwwyyxx@gmail.com>
# Author: tensorpack contributors
import tensorflow as tf
from termcolor import colored
from ..utils import logger
from .summary import add_moving_summary
from .tower import get_current_tower_context
__all__ = ['describe_model', 'get_shape_str']
__all__ = ['describe_model', 'get_shape_str', 'apply_slim_collections']
def describe_model():
......@@ -46,3 +48,36 @@ def get_shape_str(tensors):
assert isinstance(tensors, (tf.Tensor, tf.Variable)), "Not a tensor: {}".format(type(tensors))
shape_str = str(tensors.get_shape().as_list())
return shape_str
def apply_slim_collections(cost):
"""
Apply slim collections to the cost, including:
1. adding the cost with the regularizers in ``tf.GraphKeys.REGULARIZATION_LOSSES``.
2. make the cost depend on ``tf.GraphKeys.UPDATE_OPS``.
Args:
cost: a scalar tensor
Returns:
a scalar tensor, the cost after applying the collections.
"""
regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
if len(regulization_losses) > 0:
logger.info("Applying REGULARIZATION_LOSSES on cost.")
reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
cost = tf.add(reg_loss, cost, name='total_cost')
add_moving_summary(reg_loss, cost)
# As these batch-norm statistics quickly accumulate, there is no significant loss of accuracy
# if only the main tower handles all batch-normalization updates, which are then shared across
# the towers
ctx = get_current_tower_context()
if ctx is not None and ctx.is_main_training_tower:
non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
if non_grad_updates:
logger.info("Applying UPDATE_OPS collection on cost.")
with tf.control_dependencies(non_grad_updates):
cost = tf.identity(cost, name='cost_with_update')
return cost
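A hedged sketch of what this helper does to a cost tensor (the regularized variable is illustrative; outside a main training tower only the regularization part takes effect):

```python
import tensorflow as tf
from tensorpack.tfutils.modelutils import apply_slim_collections

# The TF 1.x contrib regularizer populates tf.GraphKeys.REGULARIZATION_LOSSES.
w = tf.get_variable('w', shape=[10],
                    regularizer=tf.contrib.layers.l2_regularizer(1e-4))
cost = tf.reduce_sum(tf.square(w))
# total_cost == cost + weight decay; it also gains control dependencies on
# UPDATE_OPS when built inside the main training tower.
total_cost = apply_slim_collections(cost)
```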
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: optimizer.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>
import tensorflow as tf
from .gradproc import apply_grad_processors as apply_gradproc
class ProxyOptimizer(tf.train.Optimizer):
def __init__(self, opt):
self._opt = opt
def compute_gradients(self, *args, **kwargs):
return self._opt.compute_gradients(*args, **kwargs)
def get_slot(self, *args, **kwargs):
return self._opt.get_slot(*args, **kwargs)
def get_slot_names(self, *args, **kwargs):
return self._opt.get_slot_names(*args, **kwargs)
def apply_gradients(self, *args, **kwargs):
return self._opt.apply_gradients(*args, **kwargs)
def apply_grad_processors(opt, gradprocs):
"""
Wrapper around optimizers to apply gradient processors.
Args:
opt (tf.train.Optimizer):
gradprocs (list[GradientProcessor]): gradient processors to add to the
optimizer.
Returns:
a :class:`tf.train.Optimizer` instance which runs the gradient
processors before updating the variables.
"""
class _ApplyGradientProcessor(ProxyOptimizer):
def __init__(self, opt, gradprocs):
self._gradprocs = gradprocs
super(_ApplyGradientProcessor, self).__init__(opt)
def apply_gradients(self, grads_and_vars,
global_step=None, name=None):
g = apply_gradproc(grads_and_vars, self._gradprocs)
return self._opt.apply_gradients(g, global_step, name)
return _ApplyGradientProcessor(opt, gradprocs)
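A hedged usage sketch of the wrapper added above (the base optimizer and the particular processors are illustrative choices):

```python
import tensorflow as tf
from tensorpack.tfutils import gradproc
from tensorpack.tfutils.optimizer import apply_grad_processors

w = tf.get_variable('w', shape=[3])
cost = tf.reduce_sum(tf.square(w))

opt = apply_grad_processors(
    tf.train.AdamOptimizer(1e-3),
    [gradproc.GlobalNormClip(5), gradproc.SummaryGradient()])
# The returned ProxyOptimizer runs the processors inside apply_gradients(),
# so a plain minimize() already sees clipped, summarized gradients.
train_op = opt.minimize(cost)
```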
......@@ -82,10 +82,11 @@ class SyncMultiGPUTrainer(MultiGPUTrainer,
Args:
config, input_queue: same as in :class:`QueueInputTrainer`.
average_cost (bool): average the cost (instead of gradients) from
each tower and did backprop only once. Should no make
each tower and do backprop only once. This option should make no
difference mathematically, but may affect speed.
"""
if config.dataflow is not None:
# use QueueInput by default. May need to avoid this in the future (when more input types are available)
self._input_method = QueueInput(config.dataflow, input_queue)
else:
self._input_method = config.data
......