Commit 43f7ca75 authored by Yuxin Wu

EstimatedTimeLeft callback

parent 4f52bcfd
@@ -29,7 +29,7 @@ matrix:
       env: TF_VERSION=1.6.0 TF_TYPE=release
     - os: linux
       python: 3.5
-      env: TF_VERSION=1.6.0 TF_TYPE=release
+      env: TF_VERSION=1.6.0 TF_TYPE=release PYPI=true
     - os: linux
       python: 2.7
       env: TF_VERSION=1.head TF_TYPE=nightly
@@ -90,7 +90,7 @@ deploy:
       branch: master
       repo: ppwwyyxx/tensorpack
       python: "3.5"
-      condition: "$TF_TYPE = release"
+      condition: "$PYPI = true"
   - provider: pypi
     server: https://testpypi.python.org/pypi
@@ -103,4 +103,4 @@ deploy:
       branch: test-travis
       repo: ppwwyyxx/tensorpack
       python: "3.5"
-      condition: "$TF_TYPE = release"
+      condition: "$PYPI = true"
@@ -368,6 +368,7 @@ def autodoc_skip_member(app, what, name, obj, skip, options):
         'GaussianDeform',
         'dump_chkpt_vars',
         'DumpTensor',
+        'DumpParamAsImage',
         'StagingInputWrapper',
         'set_tower_func',
         'TryResumeTraining',
@@ -5,4 +5,4 @@ Sphinx>=1.6
 recommonmark==0.4.0
 sphinx_rtd_theme
 mock
-tensorflow
+tensorflow==1.5.0
@@ -45,8 +45,10 @@ callbacks=[
                         'val-error-top1'),
     # record GPU utilizations during training
     GPUUtilizationTracker(),
-    # Touch a file to pause the training and start a debug shell, to observe what's going on
-    InjectShell(shell='ipython')
+    # touch a file to pause the training and start a debug shell, to observe what's going on
+    InjectShell(shell='ipython'),
+    # estimate time until completion
+    EstimatedTimeLeft()
 ] + [ # these callbacks are enabled by default already, though you can customize them
     # maintain those moving average summaries defined in the model (e.g. training loss, training error)
     MovingAverageSummary(),
@@ -298,8 +298,7 @@ def predict(pred_func, input_file):
 class EvalCallback(Callback):
     def _setup_graph(self):
         self.pred = self.trainer.get_predictor(
-            ['image'],
-            get_model_output_names())
+            ['image'], get_model_output_names())
         self.df = get_eval_dataflow()

     def _before_train(self):
@@ -389,6 +388,7 @@ if __name__ == '__main__':
             ScheduledHyperParamSetter('learning_rate', lr_schedule),
             EvalCallback(),
             GPUUtilizationTracker(),
+            EstimatedTimeLeft(),
         ],
         steps_per_epoch=stepnum,
         max_epoch=config.LR_SCHEDULE[2] * factor // stepnum,
@@ -77,6 +77,7 @@ def get_config(model, fake=False):
     BASE_LR = 0.1 * (args.batch / 256.0)
     callbacks = [
         ModelSaver(),
+        EstimatedTimeLeft(),
         ScheduledHyperParamSetter(
             'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                               (85, BASE_LR * 1e-3), (95, BASE_LR * 1e-4), (105, BASE_LR * 1e-5)]),
@@ -17,7 +17,7 @@ if STATICA_HACK:
     from .param import *
     from .prof import *
     from .saver import *
-    from .stats import *
+    from .misc import *
     from .steps import *
     from .summary import *
     from .trigger import *
@@ -8,12 +8,14 @@
 import tensorflow as tf
 import os
 import numpy as np
+from six.moves import zip

 from ..utils import logger
 from .base import Callback
-from six.moves import zip
+from ..tfutils.common import get_op_tensor_name

-__all__ = ['RunOp', 'RunUpdateOps', 'ProcessTensors', 'DumpTensors', 'DumpTensor']
+__all__ = ['RunOp', 'RunUpdateOps', 'ProcessTensors', 'DumpTensors',
+           'DumpTensor', 'DumpTensorAsImage', 'DumpParamAsImage']


 class RunOp(Callback):
@@ -158,4 +160,64 @@ class DumpTensors(ProcessTensors):
        super(DumpTensors, self).__init__(names, fn)
class DumpTensorAsImage(Callback):
"""
Dump a tensor to image(s) to ``logger.get_logger_dir()`` once triggered.
Note that it requires the tensor is directly evaluable, i.e. either inputs
are not its dependency (e.g. the weights of the model), or the inputs are
feedfree (in which case this callback will take an extra datapoint from the input pipeline).
"""
def __init__(self, tensor_name, prefix=None, map_func=None, scale=255):
"""
Args:
tensor_name (str): the name of the tensor.
prefix (str): the filename prefix for saved images. Defaults to the Op name.
map_func: map the value of the tensor to an image or list of
images of shape [h, w] or [h, w, c]. If None, will use identity.
scale (float): a multiplier on pixel values, applied after map_func.
"""
op_name, self.tensor_name = get_op_tensor_name(tensor_name)
self.func = map_func
if prefix is None:
self.prefix = op_name
else:
self.prefix = prefix
self.log_dir = logger.get_logger_dir()
self.scale = scale
def _before_train(self):
self._tensor = self.graph.get_tensor_by_name(self.tensor_name)
def _trigger(self):
val = self.trainer.sess.run(self._tensor)
if self.func is not None:
val = self.func(val)
if isinstance(val, list) or val.ndim == 4:
for idx, im in enumerate(val):
self._dump_image(im, idx)
else:
self._dump_image(val)
self.trainer.monitors.put_image(self.prefix, val)
def _dump_image(self, im, idx=None):
assert im.ndim in [2, 3], str(im.ndim)
fname = os.path.join(
self.log_dir,
self.prefix + '-ep{:03d}{}.png'.format(
self.epoch_num, '-' + str(idx) if idx else ''))
res = im * self.scale
res = np.clip(res, 0, 255)
cv2.imwrite(fname, res.astype('uint8'))
try:
import cv2
except ImportError:
from ..utils.develop import create_dummy_class
DumpTensorAsImage = create_dummy_class('DumpTensorAsImage', 'cv2') # noqa
# alias
DumpParamAsImage = DumpTensorAsImage
DumpTensor = DumpTensors
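A minimal usage sketch for the relocated image-dumping callback; the tensor name 'viz' and the prefix are made-up placeholders, and the tensor is assumed to be directly evaluable as the docstring requires:

# hypothetical example: dump a [N, H, W, 3] visualization tensor named 'viz' whenever the callback triggers
callbacks = [
    DumpTensorAsImage('viz', prefix='prediction', scale=255),
    # DumpParamAsImage('viz') still works through the alias kept above
]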
# -*- coding: utf-8 -*-
# File: misc.py
import os
import time
from collections import deque
import numpy as np
from .base import Callback
from ..utils.utils import humanize_time_delta
from ..utils import logger
__all__ = ['SendStat', 'InjectShell', 'EstimatedTimeLeft']
class SendStat(Callback):
""" An equivalent of :class:`SendMonitorData`, but as a normal callback. """
def __init__(self, command, names):
self.command = command
if not isinstance(names, list):
names = [names]
self.names = names
def _trigger(self):
M = self.trainer.monitors
v = {k: M.get_latest(k) for k in self.names}
cmd = self.command.format(**v)
ret = os.system(cmd)
if ret != 0:
logger.error("Command {} failed with ret={}!".format(cmd, ret))
class InjectShell(Callback):
"""
Allow users to create a specific file as a signal to pause
and iteratively debug the training.
Once triggered, it detects whether the file exists, and opens an
IPython/pdb shell if yes.
In the shell, `self` is this callback, `self.trainer` is the trainer, and
from that you can access everything else.
"""
def __init__(self, file='INJECT_SHELL.tmp', shell='ipython'):
"""
Args:
file (str): if this file exists, will open a shell.
shell (str): one of 'ipython', 'pdb'
"""
self._file = file
assert shell in ['ipython', 'pdb']
self._shell = shell
logger.info("Create a file '{}' to open {} shell.".format(file, shell))
def _trigger(self):
if os.path.isfile(self._file):
logger.info("File {} exists, entering shell.".format(self._file))
self._inject()
def _inject(self):
trainer = self.trainer # noqa
if self._shell == 'ipython':
import IPython as IP # noqa
IP.embed()
elif self._shell == 'pdb':
import pdb # noqa
pdb.set_trace()
def _after_train(self):
if os.path.isfile(self._file):
os.unlink(self._file)
class EstimatedTimeLeft(Callback):
"""
Estimate the time left until completion of training.
"""
def __init__(self, last_k_epochs=5):
"""
Args:
last_k_epochs (int): Use the time spent on last k epochs to
estimate total time left.
"""
self._times = deque(maxlen=last_k_epochs)
def _before_train(self):
self._max_epoch = self.trainer.max_epoch
self._last_time = time.time()
def _trigger_epoch(self):
duration = time.time() - self._last_time
self._last_time = time.time()
self._times.append(duration)
average_epoch_time = np.mean(self._times)
time_left = (self._max_epoch - self.epoch_num) * average_epoch_time
logger.info("Estimated Time Left: " + humanize_time_delta(time_left))
-#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # File: stats.py

+# for compatibility only
+from .misc import InjectShell, SendStat  # noqa
+from .graph import DumpParamAsImage  # noqa
+
+__all__ = []
-
-import os
-import numpy as np
-
-from .base import Callback
-from ..utils import logger
-from ..tfutils.common import get_op_tensor_name
-
-__all__ = ['SendStat', 'DumpParamAsImage', 'InjectShell']
class SendStat(Callback):
""" An equivalent of :class:`SendMonitorData`, but as a normal callback. """
def __init__(self, command, names):
self.command = command
if not isinstance(names, list):
names = [names]
self.names = names
def _trigger(self):
M = self.trainer.monitors
v = {k: M.get_latest(k) for k in self.names}
cmd = self.command.format(**v)
ret = os.system(cmd)
if ret != 0:
logger.error("Command {} failed with ret={}!".format(cmd, ret))
class InjectShell(Callback):
"""
Allow users to create a specific file as a signal to pause
and iteratively debug the training.
Once triggered, it detects whether the file exists, and opens an
IPython/pdb shell if yes.
In the shell, `self` is this callback, `self.trainer` is the trainer, and
from that you can access everything else.
"""
def __init__(self, file='INJECT_SHELL.tmp', shell='ipython'):
"""
Args:
file (str): if this file exists, will open a shell.
shell (str): one of 'ipython', 'pdb'
"""
self._file = file
assert shell in ['ipython', 'pdb']
self._shell = shell
logger.info("Create a file '{}' to open {} shell.".format(file, shell))
def _trigger(self):
if os.path.isfile(self._file):
logger.info("File {} exists, entering shell.".format(self._file))
self._inject()
def _inject(self):
trainer = self.trainer # noqa
if self._shell == 'ipython':
import IPython as IP # noqa
IP.embed()
elif self._shell == 'pdb':
import pdb # noqa
pdb.set_trace()
def _after_train(self):
if os.path.isfile(self._file):
os.unlink(self._file)
class DumpParamAsImage(Callback):
"""
Dump a tensor to image(s) to ``logger.get_logger_dir()`` once triggered.
Note that it requires the tensor is directly evaluable, i.e. either inputs
are not its dependency (e.g. the weights of the model), or the inputs are
feedfree (in which case this callback will take an extra datapoint from the input pipeline).
"""
def __init__(self, tensor_name, prefix=None, map_func=None, scale=255):
"""
Args:
tensor_name (str): the name of the tensor.
prefix (str): the filename prefix for saved images. Defaults to the Op name.
map_func: map the value of the tensor to an image or list of
images of shape [h, w] or [h, w, c]. If None, will use identity.
scale (float): a multiplier on pixel values, applied after map_func.
"""
op_name, self.tensor_name = get_op_tensor_name(tensor_name)
self.func = map_func
if prefix is None:
self.prefix = op_name
else:
self.prefix = prefix
self.log_dir = logger.get_logger_dir()
self.scale = scale
def _before_train(self):
self._tensor = self.graph.get_tensor_by_name(self.tensor_name)
def _trigger(self):
val = self.trainer.sess.run(self._tensor)
if self.func is not None:
val = self.func(val)
if isinstance(val, list) or val.ndim == 4:
for idx, im in enumerate(val):
self._dump_image(im, idx)
else:
self._dump_image(val)
self.trainer.monitors.put_image(self.prefix, val)
def _dump_image(self, im, idx=None):
assert im.ndim in [2, 3], str(im.ndim)
fname = os.path.join(
self.log_dir,
self.prefix + '-ep{:03d}{}.png'.format(
self.epoch_num, '-' + str(idx) if idx else ''))
res = im * self.scale
res = np.clip(res, 0, 255)
cv2.imwrite(fname, res.astype('uint8'))
try:
import cv2
except ImportError:
from ..utils.develop import create_dummy_class
DumpParamAsImage = create_dummy_class('DumpParamAsImage', 'cv2') # noqa
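Since stats.py now only re-exports the moved classes, old import paths should keep resolving to the same objects; a small sketch to illustrate the compatibility shim:

# the old module path still works via the re-export in stats.py ...
from tensorpack.callbacks.stats import DumpParamAsImage as OldAlias
# ... and refers to the very same class as the new location
from tensorpack.callbacks.graph import DumpParamAsImage
assert OldAlias is DumpParamAsImage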