Commit c7b5a85f authored by Yuxin Wu

Let queue_size summary be part of QueueInput callback

parent a6aa8bce
@@ -56,7 +56,7 @@ class RunOp(Callback):
def _before_run(self, _):
if self.run_step:
self._print()
return self._fetch # faster than return [self._op]
return self._fetch
def _print(self):
if self.verbose:
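Returning fetches from `_before_run` means the op is fetched as part of the same `session.run()` call as the training step, instead of a separate run per step. A minimal sketch of the same mechanism using the plain TF 1.x `tf.train.SessionRunHook` API (the hook class below is illustrative, not tensorpack's):

```python
import tensorflow as tf

class FetchOpEveryStep(tf.train.SessionRunHook):
    """Illustrative hook: bundle an extra op into each training step's run call."""
    def __init__(self, op):
        # Build the SessionRunArgs once; returning it from before_run is cheaper
        # than constructing a new fetch list on every step.
        self._fetch = tf.train.SessionRunArgs(fetches=op)

    def before_run(self, run_context):
        # MonitoredSession merges these fetches into the current session.run().
        return self._fetch
```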
@@ -16,7 +16,7 @@ from six.moves import range
from ..utils import logger
from ..utils.utils import get_tqdm_kwargs
from ..utils.develop import deprecated
from ..dataflow import DataFlow
from ..dataflow.base import DataFlow, DataFlowTerminated
from ..graph_builder.input_source_base import InputSource
from ..graph_builder.input_source import (
@@ -79,13 +79,13 @@ class InferenceRunnerBase(Callback):
tower_id = self.trainer.config.predict_tower[0]
device = '/gpu:{}'.format(tower_id) if tower_id >= 0 else '/cpu:0'
cbs = self._input_source.setup(self.trainer.model.get_inputs_desc())
self._input_callbacks = self._input_source.setup(self.trainer.model.get_inputs_desc())
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
self._tower_handle = self.trainer.predictor_factory.build(
self._tower_name, device, self._input_source)
self._hooks = [self._build_hook(inf) for inf in self.infs]
self._hooks.extend([CallbackToHook(cb) for cb in cbs])
self._hooks.extend([CallbackToHook(cb) for cb in self._input_callbacks])
for inf in self.infs:
inf.setup_graph(self.trainer)
@@ -108,8 +108,8 @@ class InferenceRunnerBase(Callback):
try:
for _ in tqdm.trange(self._size, **get_tqdm_kwargs()):
self._hooked_sess.run(fetches=[])
except StopIteration:
raise RuntimeError(
except (StopIteration, DataFlowTerminated):
logger.exception(
"[InferenceRunner] input stopped before reaching its size()! " + msg)
for inf in self.infs:
inf.trigger_epoch()
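The message above is now logged with its traceback (`logger.exception`) instead of raised, so an input that stops early no longer kills training. A hypothetical dataflow showing the kind of mismatch that triggers it, using the `get_data`/`size` DataFlow interface of this version:

```python
from tensorpack.dataflow import DataFlow

class ShortDataFlow(DataFlow):
    """Claims 100 datapoints but yields only 10, so InferenceRunner hits
    StopIteration before reaching size() and logs the warning above."""
    def get_data(self):
        for i in range(10):
            yield [i]

    def size(self):
        return 100
```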
@@ -18,9 +18,9 @@ __all__ = ['StepTensorPrinter', 'ProgressBar']
class StepTensorPrinter(Callback):
""" It prints the value of some tensors in each step.
It's just a demo of how trigger_step works but you should in general use
:func:`symbolic_functions.print_stat` or :func:`tf.Print` instead. """
""" Prints the value of some tensors in each step.
It's an example of how ``before_run/after_run`` works.
"""
def __init__(self, names):
"""
@@ -31,7 +31,7 @@ class StepTensorPrinter(Callback):
logger.warn("Using print_stat or tf.Print in the graph is much faster than StepTensorPrinter!")
self._names = names
def _before_train(self):
def _setup_graph(self):
self._fetches = get_op_or_tensor_by_name(self._names)
def _before_run(self, _):
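For reference, a hedged usage sketch: `StepTensorPrinter` takes tensor names and prints their values at every step (the names below are placeholders; as the warning above notes, `tf.Print` inside the graph is much faster):

```python
from tensorpack.callbacks import StepTensorPrinter

# Add this to the callbacks list of TrainConfig; the tensor names are examples.
print_cb = StepTensorPrinter(['total_cost', 'accuracy'])
```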
@@ -20,6 +20,7 @@ from ..tfutils.tower import get_current_tower_context
from ..utils import logger
from ..utils.concurrency import ShareSessionThread
from ..callbacks.base import Callback
from ..callbacks.graph import RunOp
__all__ = ['PlaceholderInput', 'FeedInput', 'DataParallelFeedInput',
'FeedfreeInput',
@@ -178,9 +179,6 @@ class EnqueueThread(ShareSessionThread):
self.op = self.queue.enqueue(self.placehdrs)
self.close_op = self.queue.close(cancel_pending_enqueues=True)
self.size_op = self.queue.size()
add_moving_summary(tf.cast(
self.size_op, tf.float32, name='queue_size'))
def run(self):
with self.default_sess():
@@ -235,11 +233,22 @@ class QueueInput(FeedfreeInput):
name='input_queue')
self.thread = EnqueueThread(self.queue, self.ds, self._input_placehdrs)
def _create_ema_callback(self):
with self.cached_name_scope():
# TF has no API to get the queue capacity, so we can only summarize the queue size
size = tf.cast(self.queue.size(), tf.float32, name='queue_size')
size_ema_op = add_moving_summary(size, collection=None)[0].op
return RunOp(
lambda: size_ema_op,
run_before=False,
run_as_trigger=False,
run_step=True)
def _get_callbacks(self):
from ..callbacks.concurrency import StartProcOrThread
cb = StartProcOrThread(self.thread)
cb.chief_only = False
return [cb]
return [cb, self._create_ema_callback()]
def _get_input_tensors(self):
with tf.device('/cpu:0'), self.cached_name_scope():
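With this change the queue-size EMA op becomes just one more callback returned by `QueueInput._get_callbacks()`, fetched once per step through `RunOp(..., run_step=True)` rather than being wired into `EnqueueThread`. A sketch of the same `RunOp` pattern, which can attach any cheap op to every step (`my_op` is a placeholder for an op built elsewhere):

```python
from tensorpack.callbacks import RunOp

cb = RunOp(
    lambda: my_op,         # called at setup time to obtain the op to run
    run_before=False,      # don't run it once before training starts
    run_as_trigger=False,  # don't run it on every trigger (epoch)
    run_step=True)         # fetch it together with each step's session.run
```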
@@ -141,7 +141,8 @@ class MapGradient(GradientProcessor):
_summaried_gradient = set()
# TODO let the maintain op depend on grad directly ?
# TODO has dependency problems: sess.run may not depend on grad
# maybe group maintain op and grad ?
class SummaryGradient(MapGradient):
"""
Summary histogram and RMS for each gradient variable.
@@ -165,15 +165,20 @@ def add_moving_summary(*args, **kwargs):
Args:
args: tensors to summarize
decay (float): the decay rate. Defaults to 0.95.
collection (str): the name of the collection to add EMA-maintaining ops.
collection (str or None): the name of the collection to add EMA-maintaining ops.
The default will work together with the default
:class:`MovingAverageSummary` callback.
Returns:
[tf.Tensor]: list of tensors returned by assign_moving_average,
which can be used to maintain the EMA.
"""
decay = kwargs.pop('decay', 0.95)
coll = kwargs.pop('collection', MOVING_SUMMARY_OPS_KEY)
assert len(kwargs) == 0, "Unknown arguments: " + str(kwargs)
ctx = get_current_tower_context()
# allow ctx to be none
if ctx is not None and not ctx.is_main_training_tower:
return
@@ -188,21 +193,26 @@ def add_moving_summary(*args, **kwargs):
G = tf.get_default_graph()
# TODO variable not saved under distributed
ema_ops = []
for c in v:
name = re.sub('tower[0-9]+/', '', c.op.name)
with G.colocate_with(c), tf.name_scope(None):
# assign_moving_average creates variables with op names, therefore clear ns first.
with _enter_vs_reuse_ns('EMA') as vs:
# will actually create ns EMA_1, EMA_2, etc. tensorflow#6007
ema_var = tf.get_variable(name, shape=c.shape, dtype=c.dtype,
initializer=tf.constant_initializer(), trainable=False)
ns = vs.original_name_scope
with tf.name_scope(ns): # reuse VS&NS so that no EMA_1 will appear
with tf.name_scope(ns): # reuse VS&NS so that EMA_1 won't appear
ema_op = moving_averages.assign_moving_average(
ema_var, c, decay,
zero_debias=True, name=name + '_EMA_apply')
tf.summary.scalar(name + '-summary', ema_op)
tf.add_to_collection(coll, ema_op)
tf.summary.scalar(name + '-summary', ema_op) # write the EMA value as a summary
ema_ops.append(ema_op)
if coll is not None:
for op in ema_ops:
# TODO a new collection to summary every step?
tf.add_to_collection(coll, op)
return ema_ops
try:
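A hedged usage sketch of the two modes now documented: with the default collection the EMA-maintaining ops are run by the `MovingAverageSummary` callback, while `collection=None` hands the returned ops back to the caller (which is what `QueueInput._create_ema_callback` relies on):

```python
import tensorflow as tf
from tensorpack.tfutils.summary import add_moving_summary

cost = tf.reduce_mean(tf.random_normal([32]), name='total_cost')
queue_size = tf.constant(50., name='queue_size')

# Default: the assign_moving_average ops go to MOVING_SUMMARY_OPS_KEY and the
# MovingAverageSummary callback runs them every step.
add_moving_summary(cost)

# collection=None: nothing is added to any collection; the caller gets the ops
# back and must run them itself (e.g. wrapped in a RunOp callback).
ema_ops = add_moving_summary(queue_size, collection=None)
```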
@@ -34,7 +34,7 @@ class StopTraining(BaseException):
class MaintainStepCounter(Callback):
"""
It maintains the global step in the graph, making sure it's increased by one.
This callback is always enabled by the trainer, and you wouldn't need to use it.
This callback is always enabled by the trainer; you don't need to worry about it.
"""
def _setup_graph(self):
# ensure it exists
@@ -64,10 +64,10 @@ class Trainer(object):
Attributes:
config (TrainConfig): the config used in this trainer.
model (ModelDesc):
model (ModelDesc): alias for ``config.model``.
sess (tf.Session): the current session in use.
hooked_sess (tf.MonitoredSession): the session with hooks.
monitors (Monitors): the monitors. Callbacks can use it for logging.
monitors (Monitors): the monitors. Other callbacks can use it for logging.
local_step (int): the number of (tensorpack) steps that have finished in the current epoch.
"""
# step attr only available after before_train?
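As a hedged illustration of the `monitors` attribute documented above, a custom callback can log through it; `put_scalar` is assumed to be available on `Monitors` here, and the callback name and value are made up:

```python
from tensorpack.callbacks import Callback

class LogSomething(Callback):
    """Hypothetical callback that logs a scalar through the trainer's monitors."""
    def _trigger_epoch(self):
        value = 42.0  # placeholder for a value computed by the callback
        self.trainer.monitors.put_scalar('custom/metric', value)
```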