Commit 70d95c17 authored by Yuxin Wu's avatar Yuxin Wu

misc small improvements

parent 6dce5c4b
......@@ -57,8 +57,9 @@ You can overwrite any of the following methods to define a new callback:
* `_before_epoch(self)`, `_after_epoch(self)`
Use them __only__ when you really need something to happen __immediately__ before/after an epoch.
Otherwise, `_trigger_epoch` should be enough.
`_trigger_epoch` should be enough for most cases, as can be seen from the scheduling snippet above.
Use these two methods __only__ when you really need something to happen __immediately__ before/after an epoch.
And when you do need to use them, make sure they are very fast, so they do not delay the other callbacks that also use them.
* `_before_run(self, ctx)`, `_after_run(self, ctx, values)`
......
......@@ -66,6 +66,10 @@ class GPUUtilizationTracker(Callback):
while self._evt.is_set(): # unlikely
pass
self._evt.set()
def _trigger_epoch(self):
# Don't do this in after_epoch because
# before,after_epoch are supposed to be extremely fast by design.
stats = self._queue.get()
for idx, dev in enumerate(self._devices):
self.trainer.monitors.put_scalar('GPUUtil/{}'.format(dev), stats[idx])
......
......@@ -120,6 +120,7 @@ class FashionMnist(Mnist):
if __name__ == '__main__':
ds = Mnist('train')
ds.reset_state()
for (img, label) in ds.get_data():
from IPython import embed
embed()
......
......@@ -456,11 +456,17 @@ class TFDatasetInput(FeedfreeInput):
def dataflow_to_dataset(df, types):
"""
Wrap a dataflow to tf.data.Dataset.
Will reset df.
Will also reset the dataflow.
If for training, you'll need to add `.repeat()` on the returned
dataset, if the dataflow iterator can terminate.
Args:
df (DataFlow)
types([tf.DType])
Returns:
(tf.data.Dataset)
"""
assert isinstance(df, DataFlow), df
assert isinstance(types, (list, tuple)), types
......
......@@ -27,11 +27,17 @@ os.environ['TF_AUTOTUNE_THRESHOLD'] = '2' # use more warm-up
# Since 1.3, this is not needed
os.environ['TF_AVGPOOL_USE_CUDNN'] = '1' # issue#8566
# TF1.5 features from tensorflow/benchmarks
# TF1.5 features
os.environ['TF_SYNC_ON_FINISH'] = '0' # will become default
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '2'
# Available in TF1.6+. Haven't seen different performance on R50.
# NOTE TF set it to 0 by default, because:
# this mode may use scaled atomic integer reduction, which may cause a numerical
# overflow for certain input data ranges.
# os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
try:
import tensorflow as tf # noqa
_version = tf.__version__.split('.')
......
......@@ -253,8 +253,8 @@ class Trainer(object):
for self.loop._epoch_num in range(
self.loop.starting_epoch, self.loop.max_epoch + 1):
logger.info("Start Epoch {} ...".format(self.loop.epoch_num))
start_time = time.time()
self._callbacks.before_epoch()
start_time = time.time()
for self.loop._local_step in range(self.loop.steps_per_epoch):
if self.hooked_sess.should_stop():
return
......@@ -267,8 +267,8 @@ class Trainer(object):
# trigger epoch outside the timing region.
self._callbacks.trigger_epoch()
logger.info("Training has finished!")
except (StopTraining, tf.errors.OutOfRangeError):
logger.info("Training was stopped.")
except (StopTraining, tf.errors.OutOfRangeError) as e:
logger.info("Training was stopped by exception {}.".format(str(e)))
except KeyboardInterrupt:
logger.info("Detected Ctrl-C and exiting main loop.")
raise
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment