Commit bc9d2e1a authored by Yuxin Wu

misc small changes

parent 4142b9e7
@@ -38,14 +38,13 @@ Benchmark your DataFlow with modifications to understand which part is the bottleneck.
 2. Gradually add some pre-processing and see how the performance changes.
 3. Change the number of parallel processes or threads.

-A DataFlow could be blocked by CPU/hard disk/network/IPC bandwidth. Only by benchmarking will you
+A DataFlow could be blocked by CPU/disk/network/IPC bandwidth. Only by benchmarking will you
 know the reason and improve it accordingly, e.g.:

 1. Use single-file database to avoid random read on hard disk.
-2. Write faster pre-processing with whatever tools you have.
+2. Use fewer pre-processings or write faster ones with whatever tools you have.
 3. Move certain pre-processing (e.g. mean/std normalization) to the graph, if TF has fast implementation of it.
-4. Compress your data (e.g. use uint8 images, or JPEG-compressed images) before sending them through
-   anything (network, ZMQ pipe, Python-TF copy etc.)
+4. Compress your data (e.g. use uint8 images, or JPEG-compressed images) before sending them through anything (network, ZMQ pipe, Python-TF copy etc.)
 5. Use distributed data preprocessing, with `send_dataflow_zmq` and `RemoteDataZMQ`.

 ## Investigate TensorFlow
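For item 5 above, a minimal sketch of what distributed preprocessing could look like with the ZMQ helpers mentioned in the doc (the host names, ports, and the example dataset are illustrative assumptions, not part of this commit):

```python
# Worker machine(s): build the heavy preprocessing pipeline locally and keep
# pushing datapoints to the trainer over ZMQ.
from tensorpack.dataflow import dataset
from tensorpack.dataflow.remote import send_dataflow_zmq, RemoteDataZMQ

def run_worker(trainer_addr='tcp://trainer-host:8877'):   # hypothetical address
    df = dataset.Cifar10('train')        # any DataFlow with expensive pre-processing
    send_dataflow_zmq(df, trainer_addr)  # loops forever, sending datapoints

# Trainer machine: receive datapoints from all workers and use the result
# like any other DataFlow.
def make_remote_dataflow(listen_addr='tcp://0.0.0.0:8877'):
    return RemoteDataZMQ(listen_addr)
```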
@@ -28,7 +28,7 @@ To train, first decompress ImageNet data into [this structure](http://tensorpack
 You should be able to see good GPU utilization (95%~99%), if your data is fast enough.
 It can finish training [within 20 hours](http://dawn.cs.stanford.edu/benchmark/ImageNet/train.html) on AWS p3.16xlarge.
-The default data pipeline is probably OK for most SSD systems.
+The default data pipeline is probably OK for machines with SSD + E5 CPUs.
 See the [tutorial](http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html) on other options to speed up your data.

 ![imagenet](imagenet-resnet.png)
@@ -58,7 +58,7 @@ def fbresnet_augmentor(isTrain):
     if isTrain:
         augmentors = [
             GoogleNetResize(),
-            imgaug.RandomOrderAug(
+            imgaug.RandomOrderAug(  # Remove these augs if your CPU is not fast enough
                 [imgaug.BrightnessScale((0.6, 1.4), clip=False),
                  imgaug.Contrast((0.6, 1.4), clip=False),
                  imgaug.Saturation(0.4, rgb=False),
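Following the new comment, a hedged sketch of what a lighter training augmentor could look like if the color jittering above is too slow on your CPU (an illustration, not code from this commit; `GoogleNetResize` is the helper defined alongside `fbresnet_augmentor` in the same file):

```python
from tensorpack.dataflow import imgaug

def cheap_augmentor():
    # Keep only the cheap geometric augmentations: random-resized crop + flip.
    return [
        GoogleNetResize(),
        imgaug.Flip(horiz=True),
    ]
```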
@@ -30,7 +30,7 @@ GAN_FACTOR_PARAMETER = 2.
 def normalize(v):
     assert isinstance(v, tf.Tensor)
     v.get_shape().assert_has_rank(4)
-    return v / tf.reduce_mean(v, axis=[1, 2, 3], keep_dims=True)
+    return v / tf.reduce_mean(v, axis=[1, 2, 3], keepdims=True)


 def gram_matrix(v):
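The `keep_dims` → `keepdims` rename follows TensorFlow's argument deprecation; older TF releases that only know the old spelling reject the new keyword with a `TypeError`. If both versions need to be supported, a small compatibility shim (an illustrative sketch, not part of this commit) could look like:

```python
import tensorflow as tf

def reduce_mean_keepdims(x, axis):
    """Call tf.reduce_mean keeping the reduced dims, on both old and new TF."""
    try:
        return tf.reduce_mean(x, axis=axis, keepdims=True)   # TF >= 1.5
    except TypeError:
        return tf.reduce_mean(x, axis=axis, keep_dims=True)  # older TF
```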
@@ -336,7 +336,13 @@ class StatMonitorParamSetter(HyperParamSetter):
         self.last_changed_epoch = 0

     def _get_value_to_set(self):
-        hist = self.trainer.monitors.get_history(self.stat_name)
+        try:
+            hist = self.trainer.monitors.get_history(self.stat_name)
+        except KeyError:
+            logger.warn(
+                "[StatMonitorParamSetter] Key {} not found in monitor history! Ignore it.".format(self.stat_name))
+            return None
         if len(hist) < self.last_k + 1 or \
                 self.epoch_num - self.last_changed_epoch < self.last_k:
             return None
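With the new guard, a misspelled or never-logged statistic name now only produces a warning instead of crashing the callback. A hedged usage sketch (the statistic name and decay factor are assumptions for illustration):

```python
from tensorpack.callbacks import StatMonitorParamSetter

# Multiply the learning rate by 0.2 whenever 'val-error' has not improved
# over the last 5 epochs. If 'val-error' is never logged, the callback now
# warns and does nothing instead of raising KeyError.
lr_decay = StatMonitorParamSetter(
    'learning_rate',                    # hyperparameter to adjust
    'val-error',                        # monitored statistic (assumed name)
    lambda lr: lr * 0.2,
    threshold=0, last_k=5)
```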
@@ -81,24 +81,36 @@ class CollectionGuard(object):
     def __init__(self, name, check_diff,
                  freeze_keys=[],
-                 diff_whitelist=[
-                     tf.GraphKeys.TRAINABLE_VARIABLES,
-                     tf.GraphKeys.GLOBAL_VARIABLES,
-                     tf.GraphKeys.QUEUE_RUNNERS,
-                     tf.GraphKeys.LOCAL_VARIABLES]):
+                 diff_whitelist=None):
         """
         Args:
             name (str): name of the tower
-            check_diff (bool): whether to test and print about collection change
+            check_diff (bool): whether to check and print about collection change
+                when leaving this guard.
             freeze_keys (list): list of keys to freeze
-            diff_whitelist (list): list of keys to not print, when check_diff is True
+            diff_whitelist (list): list of keys to ignore, when check_diff is True.
+                Defaults to some collections that are normally changed,
+                including variables, losses, contexts, queue runners.
         """
         self._name = name
         self._check_diff = check_diff
+        if diff_whitelist is None:
+            diff_whitelist = CollectionGuard._default_diff_whitelist()
         self._whitelist = set(diff_whitelist)
         self._freeze_keys = freeze_keys
         self._inverse_graphkeys = get_inverse_graphkeys()

+    @staticmethod
+    def _default_diff_whitelist():
+        ret = [tf.GraphKeys.TRAINABLE_VARIABLES,
+               tf.GraphKeys.GLOBAL_VARIABLES,
+               tf.GraphKeys.QUEUE_RUNNERS,
+               tf.GraphKeys.LOCAL_VARIABLES]
+        for newkey in ['COND_CONTEXT', 'WHILE_CONTEXT', 'LOSSES']:
+            if hasattr(tf.GraphKeys, newkey):
+                ret.append(getattr(tf.GraphKeys, newkey))
+        return ret
+
     def _key_name(self, name):
         return self._inverse_graphkeys.get(name, name)
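The `hasattr` loop is there because `COND_CONTEXT`, `WHILE_CONTEXT` and `LOSSES` only exist as `tf.GraphKeys` members in newer TF releases. For context, a hedged sketch of how the guard is typically driven (mirroring what `TowerContext` does internally; `build_graph_fn` is a placeholder, not a real function in the repo):

```python
import tensorflow as tf

# CollectionGuard is the class shown in the diff above.
guard = CollectionGuard(
    'tower-pred', check_diff=True,
    freeze_keys=[tf.GraphKeys.UPDATE_OPS])  # collections to restore on exit

with guard:              # used as a context manager
    build_graph_fn()     # placeholder for code that adds ops/collections
# On exit, changes to non-whitelisted collections are reported, and the
# frozen keys are restored to their previous contents.
```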
@@ -8,7 +8,7 @@ from six.moves import zip
 from ..utils import logger
 from ..utils.argtools import call_only_once
-from ..utils.naming import TRAIN_TOWER_FREEZE_KEYS, PREDICT_TOWER_FREEZE_KEYS
+from ..utils.naming import MOVING_SUMMARY_OPS_KEY
 from ..utils.develop import HIDE_DOC
 from .collection import CollectionGuard
 from .common import get_tf_version_number, get_op_or_tensor_by_name, get_op_tensor_name
@@ -122,8 +122,9 @@ class TowerContext(object):
         if self.is_main_training_tower:
             return []
         if self.is_training:
-            return TRAIN_TOWER_FREEZE_KEYS
-        return PREDICT_TOWER_FREEZE_KEYS
+            return [tf.GraphKeys.SUMMARIES, MOVING_SUMMARY_OPS_KEY]
+        # freeze UPDATE_OPS during inference because they should never be used
+        return [tf.GraphKeys.SUMMARIES, MOVING_SUMMARY_OPS_KEY, tf.GraphKeys.UPDATE_OPS]

     def __enter__(self):
         global _CurrentTowerContext
@@ -2,17 +2,7 @@
 # File: naming.py

-import tensorflow as tf
-
 GLOBAL_STEP_INCR_OP_NAME = 'global_step_incr'
-GLOBAL_STEP_INCR_VAR_NAME = 'global_step_incr:0'

 # extra variables to summarize during training in a moving-average way
 MOVING_SUMMARY_OPS_KEY = 'MOVING_SUMMARY_OPS'

-SUMMARY_BACKUP_KEYS = [tf.GraphKeys.SUMMARIES, MOVING_SUMMARY_OPS_KEY]
-TRAIN_TOWER_FREEZE_KEYS = SUMMARY_BACKUP_KEYS
-PREDICT_TOWER_FREEZE_KEYS = SUMMARY_BACKUP_KEYS + [tf.GraphKeys.UPDATE_OPS]
-# also freeze UPDATE_OPS in inference, because they should never be used
-
-# TODO a better way to log and warn about collection change during build_graph.