Commit 4bc0c748 authored by Yuxin Wu

update docs

parent 60f4c6df
......@@ -10,7 +10,7 @@ def train(self):
# start training:
with sess.as_default():
callbacks.before_train()
for epoch in range(epoch_start, epoch_end):
for epoch in range(starting_epoch, max_epoch + 1):
callbacks.before_epoch()
for step in range(steps_per_epoch):
self.run_step() # callbacks.{before,after}_run are hooked with session
......
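For readers skimming the diff: the loop above now runs epochs inclusively from `starting_epoch` to `max_epoch`. A minimal, self-contained sketch of the same control flow, with stub callbacks that are not part of tensorpack, makes the bounds explicit:

# Stub callbacks for illustration only; the real Trainer hooks these into a
# tf.Session, and before_run/after_run fire around each session call.
class _StubCallbacks:
    def before_train(self): print("before_train")
    def before_epoch(self): print("before_epoch")
    def after_epoch(self): print("after_epoch")
    def after_train(self): print("after_train")

def run_step():
    pass  # one optimizer step would run here

def train(starting_epoch=1, max_epoch=3, steps_per_epoch=5):
    callbacks = _StubCallbacks()
    callbacks.before_train()
    # inclusive upper bound: epochs are 1-based and max_epoch itself is run
    for epoch in range(starting_epoch, max_epoch + 1):
        callbacks.before_epoch()
        for step in range(steps_per_epoch):
            run_step()
        callbacks.after_epoch()
    callbacks.after_train()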
......@@ -59,7 +59,8 @@ class COCODetection(object):
def __init__(self, basedir, name):
assert name in COCOMeta.INSTANCE_TO_BASEDIR.keys(), name
self.name = name
self._imgdir = os.path.join(basedir, COCOMeta.INSTANCE_TO_BASEDIR[name])
self._imgdir = os.path.realpath(os.path.join(
basedir, COCOMeta.INSTANCE_TO_BASEDIR[name]))
assert os.path.isdir(self._imgdir), self._imgdir
annotation_file = os.path.join(
basedir, 'annotations/instances_{}.json'.format(name))
......
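A hypothetical call to the class in this hunk, just to show what the new `os.path.realpath` affects; the directory layout and split name are assumptions, not part of the diff:

# 'train2014' is one of the splits listed in COCOMeta.INSTANCE_TO_BASEDIR;
# the basedir below is a placeholder.
coco = COCODetection('/path/to/COCO/DIR', 'train2014')
# _imgdir is now resolved with os.path.realpath, so if basedir is a symlink
# the image paths built from it still point at the real location.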
......@@ -381,7 +381,9 @@ if __name__ == '__main__':
model=Model(),
data=QueueInput(get_train_dataflow(add_mask=config.MODE_MASK)),
callbacks=[
PeriodicCallback(
ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
every_k_epochs=20),
# linear warmup
ScheduledHyperParamSetter(
'learning_rate', warmup_schedule, interp='linear', step_based=True),
......
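For context, `warmup_schedule` is defined elsewhere in the script and not shown in this hunk; a sketch of what a step-based linear warmup schedule for `ScheduledHyperParamSetter` could look like (values are illustrative):

# Illustrative values only: (global_step, value) pairs that the callback
# above interpolates linearly, because it is constructed with
# interp='linear' and step_based=True.
base_lr = 0.01
warmup_steps = 1000
warmup_schedule = [(0, base_lr / 3), (warmup_steps, base_lr)]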
......@@ -16,8 +16,8 @@ from tensorpack.utils.argtools import memoized
class GANModelDesc(ModelDescBase):
def collect_variables(self, g_scope='gen', d_scope='discrim'):
"""
Assign self.g_vars to the parameters under scope `g_scope`,
and same with self.d_vars.
Assign `self.g_vars` to the parameters under scope `g_scope`,
and same with `self.d_vars`.
"""
self.g_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, g_scope)
assert self.g_vars
......@@ -25,7 +25,10 @@ class GANModelDesc(ModelDescBase):
assert self.d_vars
def build_losses(self, logits_real, logits_fake):
"""D and G play two-player minimax game with value function V(G,D)
"""
Build standard GAN loss and set `self.g_loss` and `self.d_loss`.
D and G play two-player minimax game with value function V(G,D)
min_G max_D V(D, G) = IE_{x ~ p_data} [log D(x)] + IE_{z ~ p_fake} [log (1 - D(G(z)))]
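As a rough sketch of what `build_losses` computes from the two logits (not necessarily identical to tensorpack's implementation, which also records accuracy summaries), the value function above is commonly implemented with sigmoid cross-entropy:

import tensorflow as tf

def standard_gan_losses(logits_real, logits_fake):
    # D should score real samples as 1 and generated samples as 0.
    d_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits_real, labels=tf.ones_like(logits_real)))
    d_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits_fake, labels=tf.zeros_like(logits_fake)))
    d_loss = tf.add(d_loss_real, d_loss_fake, name='d_loss')
    # Non-saturating G loss: G tries to make D score fake samples as 1.
    g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits_fake, labels=tf.ones_like(logits_fake)), name='g_loss')
    return d_loss, g_loss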
......@@ -58,6 +61,13 @@ class GANModelDesc(ModelDescBase):
add_moving_summary(self.g_loss, self.d_loss, d_accuracy, g_accuracy)
def _build_graph(self, inputs):
"""
Have to build one tower and set the following attributes:
g_loss, d_loss, g_vars, d_vars.
"""
pass
@memoized
def get_optimizer(self):
return self._get_optimizer()
......@@ -65,6 +75,11 @@ class GANModelDesc(ModelDescBase):
class GANTrainer(TowerTrainer):
def __init__(self, input, model):
"""
Args:
input (InputSource):
model (GANModelDesc):
"""
super(GANTrainer, self).__init__()
assert isinstance(model, GANModelDesc), model
inputs_desc = model.get_inputs_desc()
......@@ -149,6 +164,7 @@ class MultiGPUGANTrainer(TowerTrainer):
def get_cost(*inputs):
model.build_graph(*inputs)
return [model.d_loss, model.g_loss]
self.tower_func = TowerFuncWrapper(get_cost, model.get_inputs_desc())
devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
cost_list = DataParallelBuilder.build_on_towers(
......
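A hypothetical end-to-end wiring of the `GANTrainer` documented above; `my_dataflow` and `Model` (a `GANModelDesc` subclass such as the one in the DCGAN example) are placeholders, not part of this commit:

from tensorpack import QueueInput, ModelSaver

input = QueueInput(my_dataflow)          # any InputSource works here
trainer = GANTrainer(input, Model())     # Model must subclass GANModelDesc
trainer.train_with_defaults(
    callbacks=[ModelSaver()],
    steps_per_epoch=300,
    max_epoch=100)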
......@@ -39,8 +39,8 @@ class Model(ModelDesc):
bottleneck = functools.partial(resnet_bottleneck, stride_first=True)
# tensorflow with padding=SAME will by default pad [2,3] here.
# but caffe conv with stride will pad [3,3]
image = tf.pad(image, [[0, 0], [3, 3], [3, 3], [0, 0]])
# but caffe conv with stride will pad [3,2]
image = tf.pad(image, [[0, 0], [3, 2], [3, 2], [0, 0]])
image = tf.transpose(image, [0, 3, 1, 2])
with argscope([Conv2D, MaxPooling, GlobalAvgPooling, BatchNorm],
data_format='channels_first'), \
......
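A quick sanity check on the padding change above, assuming the standard 224x224 input and ResNet's 7x7 stride-2 stem convolution (these numbers are not in the hunk itself):

def conv_out(size, kernel=7, stride=2, pad_lo=3, pad_hi=2):
    # output length of a 1-D VALID convolution after explicit padding
    return (pad_lo + size + pad_hi - kernel) // stride + 1

# caffe pads 3 on both sides and also produces 112, but with stride 2 its
# last padded column is never read, so explicit [3, 2] padding is equivalent;
# TF's padding=SAME would instead pad [2, 3], shifting every window by one.
assert conv_out(224, pad_lo=3, pad_hi=2) == 112
assert conv_out(224, pad_lo=3, pad_hi=3) == 112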
......@@ -31,7 +31,8 @@ class MovingAverageSummary(Callback):
def _setup_graph(self):
ops = tf.get_collection(self._collection)
logger.info("Maintain moving average summary of {} tensors.".format(len(ops)))
logger.info("Maintain moving average summary of {} tensors in collection {}.".format(
len(ops), self._collection))
self.ema_op = tf.group(*ops, name='maintain_moving_average_summary')
self._fetch = tf.train.SessionRunArgs(fetches=self.ema_op)
......@@ -47,7 +48,7 @@ class MergeAllSummaries_RunAlone(Callback):
def _setup_graph(self):
size = len(tf.get_collection(self._key))
logger.info("Summarizing collection '{}' of size {}".format(self._key, size))
logger.info("Summarizing collection '{}' of size {}.".format(self._key, size))
self.summary_op = tf.summary.merge_all(self._key)
def _trigger_step(self):
......@@ -68,7 +69,7 @@ class MergeAllSummaries_RunWithOp(Callback):
def _setup_graph(self):
size = len(tf.get_collection(self._key))
logger.info("Summarizing collection '{}' of size {}".format(self._key, size))
logger.info("Summarizing collection '{}' of size {}.".format(self._key, size))
self.summary_op = tf.summary.merge_all(self._key)
if self.summary_op is not None:
self._fetches = tf.train.SessionRunArgs(self.summary_op)
......
......@@ -162,8 +162,9 @@ class PeriodicCallback(EnableCallbackIf):
callback (Callback): a Callback instance.
every_k_steps (int): enable the callback when ``global_step % k == 0``. Set to
None to ignore.
every_k_epochs (int): enable the callback when ``epoch_num % k == 0``. Set to
None to ignore.
every_k_epochs (int): enable the callback when ``epoch_num % k == 0``.
Also enable when the last step finishes (``epoch_num == max_epoch``
and ``local_step == steps_per_epoch - 1``). Set to None to ignore.
every_k_steps and every_k_epochs can both be set, but cannot both be None.
"""
......@@ -179,6 +180,10 @@ class PeriodicCallback(EnableCallbackIf):
return True
if self._epoch_k is not None and self.epoch_num % self._epoch_k == 0:
return True
if self._epoch_k is not None:
if self.local_step == self.trainer.steps_per_epoch - 1 and \
self.epoch_num == self.trainer.max_epoch:
return True
return False
def __str__(self):
......
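A hypothetical use of the behavior documented above; the wrapped callback and the value of k are illustrative:

from tensorpack import PeriodicCallback, ModelSaver

cb = PeriodicCallback(ModelSaver(max_to_keep=10), every_k_epochs=20)
# ModelSaver now triggers whenever epoch_num % 20 == 0, and additionally at
# the last step of the last epoch, so the final model is saved even when
# max_epoch is not a multiple of 20.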
......@@ -90,7 +90,7 @@ class ModelDescBase(object):
"""
Build the whole symbolic graph.
This is supposed to be the "tower function" when used with :class:`TowerTrainer`.
By default it will call :meth:`_build_graph` with a list of input tensors, for backward-compatibility.
By default it will call :meth:`_build_graph` with a list of input tensors.
Args:
args ([tf.Tensor]): tensors that matches the list of
......
......@@ -72,7 +72,7 @@ class FilterNoneGrad(GradientProcessor):
g.append((grad, var))
if self._verbose and len(to_print):
message = ', '.join(to_print)
logger.warn("No gradient w.r.t these trainable variables: {}".format(message))
logger.warn("No gradient w.r.t {} trainable variables: {}".format(len(to_print), message))
return g
......
......@@ -98,7 +98,7 @@ class SaverRestore(SessionInit):
"""
Args:
model_path (str): a model name (model-xxxx) or a ``checkpoint`` file.
prefix (str): during restore, add a ``prefix/`` for every variable in this checkpoint
prefix (str): during restore, add a ``prefix/`` for every variable in this checkpoint.
ignore (list[str]): list of tensor names that should be ignored during loading, e.g. learning-rate
"""
if model_path.endswith('.npy') or model_path.endswith('.npz'):
......@@ -134,7 +134,7 @@ class SaverRestore(SessionInit):
for v in graph_vars:
name = get_savename_from_varname(v.name, varname_prefix=self.prefix)
if name in self.ignore and reader.has_tensor(name):
logger.info("Variable {} in the graph will be not loaded from the checkpoint!".format(name))
logger.info("Variable {} in the graph will not be loaded from the checkpoint!".format(name))
else:
if reader.has_tensor(name):
func(reader, name, v)
......
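An illustrative use of the two arguments documented above; the checkpoint path and variable names are placeholders:

from tensorpack.tfutils.sessinit import SaverRestore

session_init = SaverRestore(
    '/path/to/train_log/model-100000',
    prefix='tower0',           # checkpoint var 'X' is loaded into 'tower0/X'
    ignore=['learning_rate'])  # never restore this variable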
......@@ -13,9 +13,7 @@ from ..utils import logger
from .common import get_op_tensor_name
__all__ = ['SessionUpdate', 'dump_session_params', 'dump_chkpt_vars',
'load_chkpt_vars',
# 'get_savename_from_varname', 'is_training_name',
'get_checkpoint_path']
'load_chkpt_vars', 'get_checkpoint_path']
def get_savename_from_varname(
......
......@@ -48,28 +48,24 @@ def apply_default_prefetch(input_source_or_dataflow, trainer):
def launch_train_with_config(config, trainer):
"""
Train with a :class:`TrainConfig` and a :class:`Trainer`, to
mimic the old training interface. It basically does the following
3 things (and you can easily do them by yourself):
present a simple training interface. It basically does the following
3 things (and you can easily do them by yourself if you need more control):
1. Setup the :class:`InputSource` with automatic prefetching,
for `config.data` or `config.dataflow`.
2. Call `trainer.setup_graph` with the :class:`InputSource`,
as well as `config.model`.
1. Setup the input with automatic prefetching,
from `config.data` or `config.dataflow`.
2. Call `trainer.setup_graph` with the input as well as `config.model`.
3. Call `trainer.train` with rest of the attributes of config.
Args:
config (TrainConfig):
trainer (Trainer): an instance of a SingleCostTrainer
trainer (Trainer): an instance of :class:`SingleCostTrainer`.
Examples:
.. code-block:: python
# With the old trainer:
SyncMultiGPUTrainerParameterServer(config, ps_device='gpu').train()
# With the current version of trainer:
launch_train_with_config(
config, SyncMultiGPUTrainerParameterServer(towers, ps_device='gpu'))
config, SyncMultiGPUTrainerParameterServer(8, ps_device='gpu'))
"""
assert isinstance(trainer, SingleCostTrainer), trainer
assert isinstance(config, TrainConfig), config
......
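A slightly fuller sketch of the interface documented above; `Model`, `my_dataflow`, and the numbers are placeholders, not part of this commit:

from tensorpack import (TrainConfig, ModelSaver, launch_train_with_config,
                        SyncMultiGPUTrainerParameterServer)

config = TrainConfig(
    model=Model(),                # a ModelDesc subclass
    dataflow=my_dataflow,         # or data=SomeInputSource(...)
    callbacks=[ModelSaver()],
    steps_per_epoch=500,
    max_epoch=100)
launch_train_with_config(
    config, SyncMultiGPUTrainerParameterServer(8, ps_device='gpu'))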