Commit 43f7ca75 authored by Yuxin Wu

EstimatedTimeLeft callback

parent 4f52bcfd
@@ -29,7 +29,7 @@ matrix:
       env: TF_VERSION=1.6.0 TF_TYPE=release
     - os: linux
       python: 3.5
-      env: TF_VERSION=1.6.0 TF_TYPE=release
+      env: TF_VERSION=1.6.0 TF_TYPE=release PYPI=true
     - os: linux
       python: 2.7
       env: TF_VERSION=1.head TF_TYPE=nightly
@@ -90,7 +90,7 @@ deploy:
       branch: master
       repo: ppwwyyxx/tensorpack
       python: "3.5"
-      condition: "$TF_TYPE = release"
+      condition: "$PYPI = true"
   - provider: pypi
     server: https://testpypi.python.org/pypi
@@ -103,4 +103,4 @@ deploy:
       branch: test-travis
       repo: ppwwyyxx/tensorpack
       python: "3.5"
-      condition: "$TF_TYPE = release"
+      condition: "$PYPI = true"
@@ -368,6 +368,7 @@ def autodoc_skip_member(app, what, name, obj, skip, options):
         'GaussianDeform',
         'dump_chkpt_vars',
         'DumpTensor',
+        'DumpParamAsImage',
         'StagingInputWrapper',
         'set_tower_func',
         'TryResumeTraining',
@@ -5,4 +5,4 @@ Sphinx>=1.6
 recommonmark==0.4.0
 sphinx_rtd_theme
 mock
-tensorflow
+tensorflow==1.5.0
@@ -45,8 +45,10 @@ callbacks=[
                         'val-error-top1'),
     # record GPU utilizations during training
     GPUUtilizationTracker(),
-    # Touch a file to pause the training and start a debug shell, to observe what's going on
-    InjectShell(shell='ipython')
+    # touch a file to pause the training and start a debug shell, to observe what's going on
+    InjectShell(shell='ipython'),
+    # estimate time until completion
+    EstimatedTimeLeft()
 ] + [ # these callbacks are enabled by default already, though you can customize them
     # maintain those moving average summaries defined in the model (e.g. training loss, training error)
     MovingAverageSummary(),
@@ -298,8 +298,7 @@ def predict(pred_func, input_file):
 class EvalCallback(Callback):
     def _setup_graph(self):
         self.pred = self.trainer.get_predictor(
-            ['image'],
-            get_model_output_names())
+            ['image'], get_model_output_names())
         self.df = get_eval_dataflow()

     def _before_train(self):
@@ -389,6 +388,7 @@ if __name__ == '__main__':
             ScheduledHyperParamSetter('learning_rate', lr_schedule),
             EvalCallback(),
             GPUUtilizationTracker(),
+            EstimatedTimeLeft(),
         ],
         steps_per_epoch=stepnum,
         max_epoch=config.LR_SCHEDULE[2] * factor // stepnum,
@@ -77,6 +77,7 @@ def get_config(model, fake=False):
     BASE_LR = 0.1 * (args.batch / 256.0)
     callbacks = [
         ModelSaver(),
+        EstimatedTimeLeft(),
         ScheduledHyperParamSetter(
             'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                               (85, BASE_LR * 1e-3), (95, BASE_LR * 1e-4), (105, BASE_LR * 1e-5)]),
@@ -17,7 +17,7 @@ if STATICA_HACK:
     from .param import *
     from .prof import *
     from .saver import *
-    from .stats import *
+    from .misc import *
     from .steps import *
     from .summary import *
     from .trigger import *
@@ -8,12 +8,14 @@
 import tensorflow as tf
 import os
 import numpy as np
+from six.moves import zip

 from ..utils import logger
 from .base import Callback
-from six.moves import zip
+from ..tfutils.common import get_op_tensor_name

-__all__ = ['RunOp', 'RunUpdateOps', 'ProcessTensors', 'DumpTensors', 'DumpTensor']
+__all__ = ['RunOp', 'RunUpdateOps', 'ProcessTensors', 'DumpTensors',
+           'DumpTensor', 'DumpTensorAsImage', 'DumpParamAsImage']


 class RunOp(Callback):
@@ -158,4 +160,64 @@ class DumpTensors(ProcessTensors):
        super(DumpTensors, self).__init__(names, fn)
class DumpTensorAsImage(Callback):
"""
Dump a tensor to image(s) to ``logger.get_logger_dir()`` once triggered.
Note that it requires the tensor is directly evaluable, i.e. either inputs
are not its dependency (e.g. the weights of the model), or the inputs are
feedfree (in which case this callback will take an extra datapoint from the input pipeline).
"""
def __init__(self, tensor_name, prefix=None, map_func=None, scale=255):
"""
Args:
tensor_name (str): the name of the tensor.
prefix (str): the filename prefix for saved images. Defaults to the Op name.
map_func: map the value of the tensor to an image or list of
images of shape [h, w] or [h, w, c]. If None, will use identity.
scale (float): a multiplier on pixel values, applied after map_func.
"""
op_name, self.tensor_name = get_op_tensor_name(tensor_name)
self.func = map_func
if prefix is None:
self.prefix = op_name
else:
self.prefix = prefix
self.log_dir = logger.get_logger_dir()
self.scale = scale
def _before_train(self):
self._tensor = self.graph.get_tensor_by_name(self.tensor_name)
def _trigger(self):
val = self.trainer.sess.run(self._tensor)
if self.func is not None:
val = self.func(val)
if isinstance(val, list) or val.ndim == 4:
for idx, im in enumerate(val):
self._dump_image(im, idx)
else:
self._dump_image(val)
self.trainer.monitors.put_image(self.prefix, val)
def _dump_image(self, im, idx=None):
assert im.ndim in [2, 3], str(im.ndim)
fname = os.path.join(
self.log_dir,
self.prefix + '-ep{:03d}{}.png'.format(
self.epoch_num, '-' + str(idx) if idx else ''))
res = im * self.scale
res = np.clip(res, 0, 255)
cv2.imwrite(fname, res.astype('uint8'))
try:
import cv2
except ImportError:
from ..utils.develop import create_dummy_class
DumpTensorAsImage = create_dummy_class('DumpTensorAsImage', 'cv2') # noqa
# alias
DumpParamAsImage = DumpTensorAsImage
DumpTensor = DumpTensors
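A minimal usage sketch for the relocated image-dumping callback; the tensor name 'viz' and the prefix are made-up placeholders, and the tensor is assumed to be directly evaluable as the docstring requires:

# hypothetical example: dump a [N, H, W, 3] visualization tensor named 'viz' whenever the callback triggers
callbacks = [
    DumpTensorAsImage('viz', prefix='prediction', scale=255),
    # DumpParamAsImage('viz') still works through the alias kept above
]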
# -*- coding: utf-8 -*-
# File: misc.py
import os
import time
from collections import deque
import numpy as np
from .base import Callback
from ..utils.utils import humanize_time_delta
from ..utils import logger
__all__ = ['SendStat', 'InjectShell', 'EstimatedTimeLeft']
class SendStat(Callback):
""" An equivalent of :class:`SendMonitorData`, but as a normal callback. """
def __init__(self, command, names):
self.command = command
if not isinstance(names, list):
names = [names]
self.names = names
def _trigger(self):
M = self.trainer.monitors
v = {k: M.get_latest(k) for k in self.names}
cmd = self.command.format(**v)
ret = os.system(cmd)
if ret != 0:
logger.error("Command {} failed with ret={}!".format(cmd, ret))
class InjectShell(Callback):
"""
Allow users to create a specific file as a signal to pause
and iteratively debug the training.
Once triggered, it detects whether the file exists, and opens an
IPython/pdb shell if yes.
In the shell, `self` is this callback, `self.trainer` is the trainer, and
from that you can access everything else.
"""
def __init__(self, file='INJECT_SHELL.tmp', shell='ipython'):
"""
Args:
file (str): if this file exists, will open a shell.
shell (str): one of 'ipython', 'pdb'
"""
self._file = file
assert shell in ['ipython', 'pdb']
self._shell = shell
logger.info("Create a file '{}' to open {} shell.".format(file, shell))
def _trigger(self):
if os.path.isfile(self._file):
logger.info("File {} exists, entering shell.".format(self._file))
self._inject()
def _inject(self):
trainer = self.trainer # noqa
if self._shell == 'ipython':
import IPython as IP # noqa
IP.embed()
elif self._shell == 'pdb':
import pdb # noqa
pdb.set_trace()
def _after_train(self):
if os.path.isfile(self._file):
os.unlink(self._file)
class EstimatedTimeLeft(Callback):
"""
Estimate the time left until completion of training.
"""
def __init__(self, last_k_epochs=5):
"""
Args:
last_k_epochs (int): Use the time spent on last k epochs to
estimate total time left.
"""
self._times = deque(maxlen=last_k_epochs)
def _before_train(self):
self._max_epoch = self.trainer.max_epoch
self._last_time = time.time()
def _trigger_epoch(self):
duration = time.time() - self._last_time
self._last_time = time.time()
self._times.append(duration)
average_epoch_time = np.mean(self._times)
time_left = (self._max_epoch - self.epoch_num) * average_epoch_time
logger.info("Estimated Time Left: " + humanize_time_delta(time_left))
-#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # File: stats.py

+# for compatibility only
+from .misc import InjectShell, SendStat  # noqa
+from .graph import DumpParamAsImage  # noqa
+
+__all__ = []
-
-import os
-import numpy as np
-
-from .base import Callback
-from ..utils import logger
-from ..tfutils.common import get_op_tensor_name
-
-__all__ = ['SendStat', 'DumpParamAsImage', 'InjectShell']
class SendStat(Callback):
""" An equivalent of :class:`SendMonitorData`, but as a normal callback. """
def __init__(self, command, names):
self.command = command
if not isinstance(names, list):
names = [names]
self.names = names
def _trigger(self):
M = self.trainer.monitors
v = {k: M.get_latest(k) for k in self.names}
cmd = self.command.format(**v)
ret = os.system(cmd)
if ret != 0:
logger.error("Command {} failed with ret={}!".format(cmd, ret))
class InjectShell(Callback):
"""
Allow users to create a specific file as a signal to pause
and iteratively debug the training.
Once triggered, it detects whether the file exists, and opens an
IPython/pdb shell if yes.
In the shell, `self` is this callback, `self.trainer` is the trainer, and
from that you can access everything else.
"""
def __init__(self, file='INJECT_SHELL.tmp', shell='ipython'):
"""
Args:
file (str): if this file exists, will open a shell.
shell (str): one of 'ipython', 'pdb'
"""
self._file = file
assert shell in ['ipython', 'pdb']
self._shell = shell
logger.info("Create a file '{}' to open {} shell.".format(file, shell))
def _trigger(self):
if os.path.isfile(self._file):
logger.info("File {} exists, entering shell.".format(self._file))
self._inject()
def _inject(self):
trainer = self.trainer # noqa
if self._shell == 'ipython':
import IPython as IP # noqa
IP.embed()
elif self._shell == 'pdb':
import pdb # noqa
pdb.set_trace()
def _after_train(self):
if os.path.isfile(self._file):
os.unlink(self._file)
class DumpParamAsImage(Callback):
"""
Dump a tensor to image(s) to ``logger.get_logger_dir()`` once triggered.
Note that it requires the tensor is directly evaluable, i.e. either inputs
are not its dependency (e.g. the weights of the model), or the inputs are
feedfree (in which case this callback will take an extra datapoint from the input pipeline).
"""
def __init__(self, tensor_name, prefix=None, map_func=None, scale=255):
"""
Args:
tensor_name (str): the name of the tensor.
prefix (str): the filename prefix for saved images. Defaults to the Op name.
map_func: map the value of the tensor to an image or list of
images of shape [h, w] or [h, w, c]. If None, will use identity.
scale (float): a multiplier on pixel values, applied after map_func.
"""
op_name, self.tensor_name = get_op_tensor_name(tensor_name)
self.func = map_func
if prefix is None:
self.prefix = op_name
else:
self.prefix = prefix
self.log_dir = logger.get_logger_dir()
self.scale = scale
def _before_train(self):
self._tensor = self.graph.get_tensor_by_name(self.tensor_name)
def _trigger(self):
val = self.trainer.sess.run(self._tensor)
if self.func is not None:
val = self.func(val)
if isinstance(val, list) or val.ndim == 4:
for idx, im in enumerate(val):
self._dump_image(im, idx)
else:
self._dump_image(val)
self.trainer.monitors.put_image(self.prefix, val)
def _dump_image(self, im, idx=None):
assert im.ndim in [2, 3], str(im.ndim)
fname = os.path.join(
self.log_dir,
self.prefix + '-ep{:03d}{}.png'.format(
self.epoch_num, '-' + str(idx) if idx else ''))
res = im * self.scale
res = np.clip(res, 0, 255)
cv2.imwrite(fname, res.astype('uint8'))
try:
import cv2
except ImportError:
from ..utils.develop import create_dummy_class
DumpParamAsImage = create_dummy_class('DumpParamAsImage', 'cv2') # noqa
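Since stats.py now only re-exports the moved classes, old import paths should keep resolving to the same objects; a small sketch to illustrate the compatibility shim:

# the old module path still works via the re-export in stats.py ...
from tensorpack.callbacks.stats import DumpParamAsImage as OldAlias
# ... and refers to the very same class as the new location
from tensorpack.callbacks.graph import DumpParamAsImage
assert OldAlias is DumpParamAsImage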