Commit 076a728f authored by Yuxin Wu

some docs update & add optimizer with gradproc

parent 3f91978c
@@ -19,6 +19,7 @@ Also note that multi-GPU doesn't give you obvious speedup here,
because the bottleneck in this implementation is not computation but data.
Some practical notes:
1. On machines without huge memory, enabling tcmalloc may keep training throughput more stable.
2. Occasionally, processes may not get terminated completely. It is suggested to use `systemd-run` to run any
   multiprocess Python program to get a cgroup dedicated for the task.
...
@@ -133,8 +133,8 @@ class Model(ModelDesc):
            value_loss, pred_reward, advantage, self.cost)

    def get_gradient_processor(self):
-        return [MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
-                SummaryGradient()]
+        return [gradproc.MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),
+                gradproc.SummaryGradient()]

class MySimulatorMaster(SimulatorMaster, Callback):
...
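Throughout the examples, this commit switches from bare gradient-processor names to the `gradproc` module namespace (the module is no longer star-imported, per the `tfutils/__init__.py` change further down). A minimal sketch of the new call-site style; the `from tensorpack.tfutils import gradproc` import path is an assumption based on the module's location and is not shown verbatim in this diff:

```python
import tensorflow as tf
from tensorpack.tfutils import gradproc  # assumed import path for tensorpack/tfutils/gradproc.py

# Old style relied on MapGradient/SummaryGradient being re-exported as bare names;
# the new style references them through the gradproc module:
procs = [
    gradproc.MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1)),  # per-gradient clipping
    gradproc.SummaryGradient(),  # add summaries for each gradient
]
```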
@@ -74,7 +74,7 @@ class Model(ModelDesc):
        summary.add_moving_summary(err, self.cost)

    def get_gradient_processor(self):
-        return [GlobalNormClip(5), SummaryGradient()]
+        return [gradproc.GlobalNormClip(5), gradproc.SummaryGradient()]

def get_data(path, isTrain, stat_file):
...
@@ -106,7 +106,7 @@ class Model(ModelDesc):
        summary.add_moving_summary(self.cost)

    def get_gradient_processor(self):
-        return [GlobalNormClip(5)]
+        return [gradproc.GlobalNormClip(5)]

def get_config():
...
@@ -150,8 +150,8 @@ class Model(ModelDesc):
        return tf.group(*ops, name='update_target_network')

    def get_gradient_processor(self):
-        return [MapGradient(lambda grad: tf.clip_by_global_norm([grad], 5)[0][0]),
-                SummaryGradient()]
+        return [gradproc.GlobalNormClip(10),
+                gradproc.SummaryGradient()]

def get_config():
...
@@ -96,17 +96,13 @@ class Model(GANModelDesc):
        fake_sample_viz = tf.cast((fake_sample + 1) * 128.0, tf.uint8, name='viz')
        tf.summary.image('gen', fake_sample_viz, max_outputs=30)

-        # TODO investigate how bn stats should be updated across two discrim
+        # may need to investigate how bn stats should be updated across two discrim
        with tf.variable_scope('discrim'):
            real_pred, _ = self.discriminator(real_sample)
        with tf.variable_scope('discrim', reuse=True):
            fake_pred, dist_param = self.discriminator(fake_sample)
-        # post-process output vector from discriminator to become valid
-        # distribution parameters
-        encoder_activation = self.factors.encoder_activation(dist_param)

        """
        Mutual information between x (i.e. zc in this case) and some
        information s (the generated samples in this case):
@@ -130,6 +126,8 @@ class Model(GANModelDesc):
        # Adding this term may make the curve less stable because the
        # entropy estimated from the samples is not the true value.

+        # post-process output vector from discriminator to obtain valid distribution parameters
+        encoder_activation = self.factors.encoder_activation(dist_param)
        cond_ents = self.factors.entropy(zc, encoder_activation)
        cond_entropy = tf.add_n(cond_ents, name="total_conditional_entropy")
@@ -139,7 +137,7 @@ class Model(GANModelDesc):
        # default GAN objective
        self.build_losses(real_pred, fake_pred)

-        # subtract mutual information for latent factores (we want to maximize them)
+        # subtract mutual information for latent factors (we want to maximize them)
        self.g_loss = tf.subtract(self.g_loss, MI, name='total_g_loss')
        self.d_loss = tf.subtract(self.d_loss, MI, name='total_d_loss')
@@ -150,7 +148,7 @@ class Model(GANModelDesc):
    def get_gradient_processor_g(self):
        # generator learns 5 times faster
-        return [CheckGradient(), ScaleGradient(('.*', 5), log=False)]
+        return [gradproc.ScaleGradient(('.*', 5), log=False)]

def get_data():
...
@@ -93,7 +93,8 @@ class Model(ModelDesc):
        add_moving_summary(costs + [wrong, self.cost])

    def get_gradient_processor(self):
-        return [ScaleGradient([('convfcweight.*', 0.1), ('conv5_.*', 5)])]
+        return [gradproc.ScaleGradient([
+            ('convfcweight.*', 0.1), ('conv5_.*', 5)])]

def get_data(name):
...
@@ -101,7 +101,7 @@ class Model(ModelDesc):
            s[1].h.assign(z))

    def get_gradient_processor(self):
-        return [GlobalNormClip(5)]
+        return [gradproc.GlobalNormClip(5)]

def get_config():
...
@@ -12,6 +12,10 @@ Models can be [downloaded here](https://goo.gl/6XjK9V).
| ResNet 50 | 7.13% | 24.12% |
| ResNet 101 | 6.54% | 22.89% |

+```bash
+./imagenet-resnet.py --data /path/to/ILSVRC --gpu 0,1,2,3 -d 18
+```

![imagenet](imagenet-resnet.png)

## load-resnet.py
...
@@ -25,6 +25,9 @@ n=5, about 7.1% val error after 67k steps (8.6 step/s)
n=18, about 5.95% val error after 80k steps (2.6 step/s)
n=30: a 182-layer network, about 5.6% val error after 51k steps (1.55 step/s)
This model uses the whole training set instead of a train-val split.
+To train:
+    ./cifar10-resnet.py --gpu 0,1
"""
BATCH_SIZE = 128
...
@@ -17,11 +17,6 @@ from tensorpack.utils.stats import RatioCounter
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.tfutils.summary import *

-"""
-Training code of Pre-Activation version of ResNet on ImageNet.
-It mainly follows the setup in fb.resnet.torch, and get similar performance.
-"""

TOTAL_BATCH_SIZE = 256
INPUT_SHAPE = 224
DEPTH = None
...
@@ -86,8 +86,8 @@ class Model(ModelDesc):
        self.cost = tf.add_n([wd_cost, cost], name='cost')

    def get_gradient_processor(self):
-        return [MapGradient(lambda grad: tf.clip_by_global_norm([grad], 5)[0][0]),
-                ScaleGradient(('STN.*', 0.1)), SummaryGradient()]
+        return [gradproc.ScaleGradient(('STN.*', 0.1)),
+                gradproc.SummaryGradient()]

def get_data(isTrain):
...
@@ -24,8 +24,10 @@ USE_SLIM = False
class Model(ModelDesc):
    def _get_inputs(self):
-        """Define all the input variables (with type, shape, name) that'll be
-        fed into the graph to produce a cost. """
+        """
+        Define all the inputs (with type, shape, name) that
+        the graph will need.
+        """
        return [InputDesc(tf.float32, (None, IMAGE_SIZE, IMAGE_SIZE), 'input'),
                InputDesc(tf.int32, (None,), 'label')]
...
@@ -9,9 +9,9 @@ import pickle
import six

from ..utils import logger, INPUTS_KEY
+from ..utils.argtools import memoized
+from ..tfutils.modelutils import apply_slim_collections
from ..tfutils.gradproc import CheckGradient
-from ..tfutils.summary import add_moving_summary
-from ..tfutils.tower import get_current_tower_context

__all__ = ['InputDesc', 'InputVar', 'ModelDesc', 'ModelFromMetaGraph']
@@ -41,8 +41,10 @@ class InputDesc(object):
        return pickle.loads(buf)

-# TODO print warning?
-InputVar = InputDesc
+class InputVar(InputDesc):
+    def __init__(self, *args, **kwargs):
+        logger.warn("[Deprecated] InputVar was renamed to InputDesc!")
+        super(InputVar, self).__init__(*args, **kwargs)

@six.add_metaclass(ABCMeta)
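`InputVar` changes from a silent alias into a thin subclass that logs a deprecation warning, so old code keeps working but is nudged toward `InputDesc`. A small sketch of the resulting behavior; the top-level re-export of both names is an assumption (both appear in `__all__` above), and the argument order follows the `InputDesc(type, shape, name)` usage in the mnist example:

```python
import tensorflow as tf
from tensorpack import InputDesc, InputVar  # assumed top-level re-exports (both listed in __all__)

# Old code still runs, but now logs "[Deprecated] InputVar was renamed to InputDesc!"
old_style = InputVar(tf.float32, (None, 28, 28), 'input')
new_style = InputDesc(tf.float32, (None, 28, 28), 'input')
assert isinstance(old_style, InputDesc)  # the shim subclasses InputDesc, so isinstance checks still pass
```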
@@ -50,6 +52,7 @@ class ModelDesc(object):
    """ Base class for a model description """

    # inputs:
+    @memoized
    def get_reused_placehdrs(self):
        """
        Create or return (if already created) raw input TF placeholders in the graph.
@@ -57,11 +60,7 @@ class ModelDesc(object):
        Returns:
            list[tf.Tensor]: the list of input placeholders in the graph.
        """
-        if hasattr(self, 'reuse_input_vars'):
-            return self.reuse_input_vars
-        ret = self.build_placeholders()
-        self.reuse_input_vars = ret
-        return ret
+        return self.build_placeholders()

    def get_input_vars(self):
        # this wasn't a public API anyway
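The hand-rolled `hasattr` cache on `get_reused_placehdrs()` is replaced by the `memoized` decorator imported from `tensorpack.utils.argtools`. A rough sketch of the equivalence; the decorator below is a simplified stand-in, not the actual tensorpack implementation:

```python
import functools


def memoized(func):
    """Simplified stand-in for tensorpack.utils.argtools.memoized: cache results keyed by the call arguments."""
    cache = {}

    @functools.wraps(func)
    def wrapper(*args):
        if args not in cache:
            cache[args] = func(*args)
        return cache[args]
    return wrapper


class PlaceholderOwner(object):  # hypothetical class, only to illustrate the caching behavior
    def build_placeholders(self):
        print("building placeholders")
        return ['input:0', 'label:0']

    @memoized
    def get_reused_placehdrs(self):
        # Before: if hasattr(self, 'reuse_input_vars'): return self.reuse_input_vars; ...
        # After: the first call is cached, so build_placeholders() runs only once per instance.
        return self.build_placeholders()


owner = PlaceholderOwner()
assert owner.get_reused_placehdrs() is owner.get_reused_placehdrs()  # "building placeholders" printed once
```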
@@ -70,7 +69,7 @@ class ModelDesc(object):
    def build_placeholders(self, prefix=''):
        """
-        For each input, create new placeholders with optional prefix and
+        For each InputDesc, create new placeholders with optional prefix and
        return them. Useful when building new towers.

        Returns:
@@ -105,8 +104,6 @@ class ModelDesc(object):
    def _get_input_vars(self):  # keep backward compatibility
        raise NotImplementedError()

-    # build graph:
    def build_graph(self, model_inputs):
        """
        Build the whole symbolic graph.
@@ -121,46 +118,35 @@ class ModelDesc(object):
    def _build_graph(self, inputs):
        pass

-    # set cost. Only for single-cost model.
    def get_cost(self):
        """
-        Return the cost tensor in the graph. Called by some of the :class:`tensorpack.train.Trainer` which
-        assumes single-cost models.
-
-        This function also apply tfslim collections to the cost automatically, including
-        ``tf.GraphKeys.REGULARIZATION_LOSSES`` and
-        ``tf.GraphKeys.UPDATE_OPS``. This is because slim users would expect
-        the regularizer being automatically applied once used in slim layers.
-        """
-        # the model cost so far
+        Return the cost tensor in the graph.
+        Used by some of the tensorpack :class:`Trainer` which assumes single-cost models.
+        You can ignore this method if you use your own trainer with more than one cost.
+
+        It calls :meth:`ModelDesc._get_cost()` which by default returns
+        ``self.cost``. You can override :meth:`_get_cost()` if needed.
+
+        This function also applies tfslim collections to the cost automatically,
+        including ``tf.GraphKeys.REGULARIZATION_LOSSES`` and ``tf.GraphKeys.UPDATE_OPS``.
+        This is because slim users would expect the regularizer being automatically applied once used in slim layers.
+        """
        cost = self._get_cost()
-        regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
-        if len(regulization_losses) > 0:
-            reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
-            cost = tf.add(reg_loss, cost, name='total_cost')
-            add_moving_summary(reg_loss, cost)
-
-        # As these batch-norm statistics quickly accumulate, there is no significant loss of accuracy
-        # if only the main tower handles all batch-normalization updates, which are then shared across
-        # the towers
-        ctx = get_current_tower_context()
-        if ctx is not None and ctx.is_main_training_tower:
-            non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
-            if non_grad_updates:
-                logger.info("Apply UPDATE_OPS collection on cost.")
-                with tf.control_dependencies(non_grad_updates):
-                    cost = tf.identity(cost)
-        return cost
+        return apply_slim_collections(cost)

    def _get_cost(self, *args):
        return self.cost

-    # set optimizer. only for single-optimizer model.
+    @memoized
    def get_optimizer(self):
        """
+        Return the optimizer used in the task.
+        Used by some of the tensorpack :class:`Trainer` which only uses a single optimizer.
+        You can ignore this method if you use your own trainer with more than one optimizer.
+
+        Users of :class:`ModelDesc` will need to implement `_get_optimizer()`,
+        which will only be called once per model.
+
        Returns:
            a :class:`tf.train.Optimizer` instance.
        """
@@ -170,7 +156,7 @@ class ModelDesc(object):
        raise NotImplementedError()

    def get_gradient_processor(self):
-        """ Return a list of :class:`tensorpack.tfutils.GradientProcessor`.
+        """ (Deprecated) Return a list of :class:`tensorpack.tfutils.GradientProcessor`.
        They will be executed by the trainer in the given order.
        """
        return [  # SummaryGradient(),
...
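With `get_gradient_processor()` now marked deprecated, the replacement suggested by the rest of this commit is presumably to attach the processors to the optimizer returned from `_get_optimizer()`, using the new `optimizer.apply_grad_processors` added below. A hedged migration sketch; the class name, learning rate, and import paths are illustrative assumptions:

```python
import tensorflow as tf
from tensorpack import ModelDesc                    # assumed top-level re-export
from tensorpack.tfutils import gradproc, optimizer  # import paths assumed from the file locations


class MigratedModel(ModelDesc):
    # _get_inputs()/_build_graph() omitted; only the optimizer hook is sketched here.

    def _get_optimizer(self):
        # get_optimizer() is @memoized, so this runs once per model.
        opt = tf.train.AdamOptimizer(1e-3)
        # Previously these would be returned from get_gradient_processor() and run by the trainer;
        # now the wrapped optimizer runs them inside apply_gradients().
        return optimizer.apply_grad_processors(
            opt, [gradproc.GlobalNormClip(5), gradproc.SummaryGradient()])
```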
@@ -19,7 +19,6 @@ def _global_import(name):
_TO_IMPORT = set([
    'common',
    'sessinit',
-    'gradproc',
    'argscope',
    'tower'
])
...
@@ -20,7 +20,7 @@ def apply_grad_processors(grads, gradprocs):
    """
    Args:
        grads (list): list of (grad, var).
-        gradprocs (list): list of :class:`GradientProcessor` instances.
+        gradprocs (list[GradientProcessor]): gradient processors to apply.

    Returns:
        list: list of (grad, var) went through the processors.
    """
...
# -*- coding: UTF-8 -*-
# File: modelutils.py
-# Author: Yuxin Wu <ppwwyyxx@gmail.com>
+# Author: tensorpack contributors

import tensorflow as tf
from termcolor import colored

from ..utils import logger
+from .summary import add_moving_summary
+from .tower import get_current_tower_context

-__all__ = ['describe_model', 'get_shape_str']
+__all__ = ['describe_model', 'get_shape_str', 'apply_slim_collections']


def describe_model():
@@ -46,3 +48,36 @@ def get_shape_str(tensors):
    assert isinstance(tensors, (tf.Tensor, tf.Variable)), "Not a tensor: {}".format(type(tensors))
    shape_str = str(tensors.get_shape().as_list())
    return shape_str
+
+
+def apply_slim_collections(cost):
+    """
+    Apply slim collections to the cost, including:
+
+    1. adding the cost with the regularizers in ``tf.GraphKeys.REGULARIZATION_LOSSES``.
+    2. make the cost depend on ``tf.GraphKeys.UPDATE_OPS``.
+
+    Args:
+        cost: a scalar tensor
+
+    Returns:
+        a scalar tensor, the cost after applying the collections.
+    """
+    regulization_losses = set(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+    if len(regulization_losses) > 0:
+        logger.info("Applying REGULARIZATION_LOSSES on cost.")
+        reg_loss = tf.add_n(list(regulization_losses), name="regularize_loss")
+        cost = tf.add(reg_loss, cost, name='total_cost')
+        add_moving_summary(reg_loss, cost)
+
+    # As these batch-norm statistics quickly accumulate, there is no significant loss of accuracy
+    # if only the main tower handles all batch-normalization updates, which are then shared across
+    # the towers
+    ctx = get_current_tower_context()
+    if ctx is not None and ctx.is_main_training_tower:
+        non_grad_updates = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
+        if non_grad_updates:
+            logger.info("Applying UPDATE_OPS collection on cost.")
+            with tf.control_dependencies(non_grad_updates):
+                cost = tf.identity(cost, name='cost_with_update')
+    return cost
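This helper factors out the collection handling that used to live inline in `ModelDesc.get_cost()`, so it can also be called directly on a custom cost. A minimal usage sketch; the loss construction here is illustrative:

```python
import tensorflow as tf
from tensorpack.tfutils.modelutils import apply_slim_collections  # path taken from the import added above

logits = tf.placeholder(tf.float32, [None, 10])
labels = tf.placeholder(tf.int32, [None])
raw_cost = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels),
    name='raw_cost')

# Adds any REGULARIZATION_LOSSES to the cost and, on the main training tower, makes it
# depend on UPDATE_OPS (the same behavior get_cost() had before this refactor).
total_cost = apply_slim_collections(raw_cost)
```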
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# File: optimizer.py
+# Author: Yuxin Wu <ppwwyyxxc@gmail.com>
+
+import tensorflow as tf
+from .gradproc import apply_grad_processors as apply_gradproc
+
+
+class ProxyOptimizer(tf.train.Optimizer):
+    def __init__(self, opt):
+        self._opt = opt
+
+    def compute_gradients(self, *args, **kwargs):
+        return self._opt.compute_gradients(*args, **kwargs)
+
+    def get_slot(self, *args, **kwargs):
+        return self._opt.get_slot(*args, **kwargs)
+
+    def get_slot_names(self, *args, **kwargs):
+        return self._opt.get_slot_names(*args, **kwargs)
+
+    def apply_gradients(self, *args, **kwargs):
+        return self._opt.apply_gradients(*args, **kwargs)
+
+
+def apply_grad_processors(opt, gradprocs):
+    """
+    Wrapper around optimizers to apply gradient processors.
+
+    Args:
+        opt (tf.train.Optimizer):
+        gradprocs (list[GradientProcessor]): gradient processors to add to the
+            optimizer.
+
+    Returns:
+        a :class:`tf.train.Optimizer` instance which runs the gradient
+        processors before updating the variables.
+    """
+    class _ApplyGradientProcessor(ProxyOptimizer):
+        def __init__(self, opt, gradprocs):
+            self._gradprocs = gradprocs
+            super(_ApplyGradientProcessor, self).__init__(opt)
+
+        def apply_gradients(self, grads_and_vars,
+                            global_step=None, name=None):
+            g = apply_gradproc(grads_and_vars, self._gradprocs)
+            return self._opt.apply_gradients(g, global_step, name)
+
+    return _ApplyGradientProcessor(opt, gradprocs)
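This new module is the "add optimizer with gradproc" half of the commit: a `ProxyOptimizer` that forwards everything to a wrapped optimizer, plus a factory that runs gradient processors inside `apply_gradients()`. A hedged usage sketch; the `tensorpack.tfutils.optimizer` import path is assumed from the file location:

```python
import tensorflow as tf
from tensorpack.tfutils import gradproc, optimizer  # import paths assumed from the file locations

base_opt = tf.train.AdamOptimizer(1e-3)
# The returned object is still a tf.train.Optimizer; its apply_gradients() first pushes
# the (grad, var) pairs through the processors, then delegates to the wrapped optimizer.
opt = optimizer.apply_grad_processors(
    base_opt,
    [gradproc.GlobalNormClip(5), gradproc.SummaryGradient()])

# opt.minimize(cost) therefore computes gradients with the wrapped optimizer and applies
# the processed gradients, with no trainer-level gradient processing needed.
```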
@@ -82,10 +82,11 @@ class SyncMultiGPUTrainer(MultiGPUTrainer,
        Args:
            config, input_queue: same as in :class:`QueueInputTrainer`.
            average_cost (bool): average the cost (instead of gradients) from
-                each tower and did backprop only once. Should no make
+                each tower and do backprop only once. This option should make no
                difference mathematically, but may affect speed.
        """
        if config.dataflow is not None:
+            # use queueinput by default. May need to avoid this in the future (when more input type is available)
            self._input_method = QueueInput(config.dataflow, input_queue)
        else:
            self._input_method = config.data
...