Commit 70d95c17 authored by Yuxin Wu's avatar Yuxin Wu

misc small improvements

parent 6dce5c4b
......@@ -57,8 +57,9 @@ You can overwrite any of the following methods to define a new callback:
* `_before_epoch(self)`, `_after_epoch(self)`
Use them __only__ when you really need something to happen __immediately__ before/after an epoch.
Otherwise, `_trigger_epoch` should be enough.
`_trigger_epoch` should be enough for most cases, as can be seen from the scheduling snippet above.
Use these two methods __only__ when you really need something to happen __immediately__ before/after an epoch.
And when you do need to use them, make sure they are very fast, so they do not delay the other callbacks that also use them.
* `_before_run(self, ctx)`, `_after_run(self, ctx, values)`
......
......@@ -66,6 +66,10 @@ class GPUUtilizationTracker(Callback):
while self._evt.is_set(): # unlikely
pass
self._evt.set()
def _trigger_epoch(self):
# Don't do this in after_epoch because
# before,after_epoch are supposed to be extremely fast by design.
stats = self._queue.get()
for idx, dev in enumerate(self._devices):
self.trainer.monitors.put_scalar('GPUUtil/{}'.format(dev), stats[idx])
......
......@@ -120,6 +120,7 @@ class FashionMnist(Mnist):
if __name__ == '__main__':
ds = Mnist('train')
ds.reset_state()
for (img, label) in ds.get_data():
from IPython import embed
embed()
......
......@@ -456,11 +456,17 @@ class TFDatasetInput(FeedfreeInput):
def dataflow_to_dataset(df, types):
"""
Wrap a dataflow to tf.data.Dataset.
Will reset df.
Will also reset the dataflow.
If for training, you'll need to add `.repeat()` on the returned
dataset, if the dataflow iterator can terminate.
Args:
df (DataFlow)
types([tf.DType])
Returns:
(tf.data.Dataset)
"""
assert isinstance(df, DataFlow), df
assert isinstance(types, (list, tuple)), types
......
......@@ -27,11 +27,17 @@ os.environ['TF_AUTOTUNE_THRESHOLD'] = '2' # use more warm-up
# Since 1.3, this is not needed
os.environ['TF_AVGPOOL_USE_CUDNN'] = '1' # issue#8566
# TF1.5 features from tensorflow/benchmarks
# TF1.5 features
os.environ['TF_SYNC_ON_FINISH'] = '0' # will become default
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '2'
# Available in TF1.6+. Haven't seen different performance on R50.
# NOTE TF set it to 0 by default, because:
# this mode may use scaled atomic integer reduction, which may cause a numerical
# overflow for certain input data ranges.
# os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
try:
import tensorflow as tf # noqa
_version = tf.__version__.split('.')
......
......@@ -253,8 +253,8 @@ class Trainer(object):
for self.loop._epoch_num in range(
self.loop.starting_epoch, self.loop.max_epoch + 1):
logger.info("Start Epoch {} ...".format(self.loop.epoch_num))
start_time = time.time()
self._callbacks.before_epoch()
start_time = time.time()
for self.loop._local_step in range(self.loop.steps_per_epoch):
if self.hooked_sess.should_stop():
return
......@@ -267,8 +267,8 @@ class Trainer(object):
# trigger epoch outside the timing region.
self._callbacks.trigger_epoch()
logger.info("Training has finished!")
except (StopTraining, tf.errors.OutOfRangeError):
logger.info("Training was stopped.")
except (StopTraining, tf.errors.OutOfRangeError) as e:
logger.info("Training was stopped by exception {}.".format(str(e)))
except KeyboardInterrupt:
logger.info("Detected Ctrl-C and exiting main loop.")
raise
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment