Commit bc9d2e1a authored by Yuxin Wu

misc small changes

parent 4142b9e7
@@ -38,14 +38,13 @@ Benchmark your DataFlow with modifications to understand which part is the bottleneck.
 2. Gradually add some pre-processing and see how the performance changes.
 3. Change the number of parallel processes or threads.

-A DataFlow could be blocked by CPU/hard disk/network/IPC bandwidth. Only by benchmarking will you
+A DataFlow could be blocked by CPU/disk/network/IPC bandwidth. Only by benchmarking will you
 know the reason and improve it accordingly, e.g.:

 1. Use single-file database to avoid random read on hard disk.
-2. Write faster pre-processing with whatever tools you have.
+2. Use fewer pre-processings or write faster ones with whatever tools you have.
 3. Move certain pre-processing (e.g. mean/std normalization) to the graph, if TF has fast implementation of it.
-4. Compress your data (e.g. use uint8 images, or JPEG-compressed images) before sending them through
-   anything (network, ZMQ pipe, Python-TF copy etc.)
+4. Compress your data (e.g. use uint8 images, or JPEG-compressed images) before sending them through anything (network, ZMQ pipe, Python-TF copy etc.)
 5. Use distributed data preprocessing, with `send_dataflow_zmq` and `RemoteDataZMQ`.

 ## Investigate TensorFlow
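For item 5 above, a minimal sketch of what distributed preprocessing could look like with the ZMQ helpers mentioned in the doc (the host names, ports, and the example dataset are illustrative assumptions, not part of this commit):

```python
# Worker machine(s): build the heavy preprocessing pipeline locally and keep
# pushing datapoints to the trainer over ZMQ.
from tensorpack.dataflow import dataset
from tensorpack.dataflow.remote import send_dataflow_zmq, RemoteDataZMQ

def run_worker(trainer_addr='tcp://trainer-host:8877'):   # hypothetical address
    df = dataset.Cifar10('train')        # any DataFlow with expensive pre-processing
    send_dataflow_zmq(df, trainer_addr)  # loops forever, sending datapoints

# Trainer machine: receive datapoints from all workers and use the result
# like any other DataFlow.
def make_remote_dataflow(listen_addr='tcp://0.0.0.0:8877'):
    return RemoteDataZMQ(listen_addr)
```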
@@ -28,7 +28,7 @@ To train, first decompress ImageNet data into [this structure](http://tensorpack
 You should be able to see good GPU utilization (95%~99%), if your data is fast enough.
 It can finish training [within 20 hours](http://dawn.cs.stanford.edu/benchmark/ImageNet/train.html) on AWS p3.16xlarge.
-The default data pipeline is probably OK for most SSD systems.
+The default data pipeline is probably OK for machines with SSD + E5 CPUs.
 See the [tutorial](http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html) on other options to speed up your data.

 ![imagenet](imagenet-resnet.png)
@@ -58,7 +58,7 @@ def fbresnet_augmentor(isTrain):
     if isTrain:
         augmentors = [
             GoogleNetResize(),
-            imgaug.RandomOrderAug(
+            imgaug.RandomOrderAug(  # Remove these augs if your CPU is not fast enough
                 [imgaug.BrightnessScale((0.6, 1.4), clip=False),
                  imgaug.Contrast((0.6, 1.4), clip=False),
                  imgaug.Saturation(0.4, rgb=False),
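Following the new comment, a hedged sketch of what a lighter training augmentor could look like if the color jittering above is too slow on your CPU (an illustration, not code from this commit; `GoogleNetResize` is the helper defined alongside `fbresnet_augmentor` in the same file):

```python
from tensorpack.dataflow import imgaug

def cheap_augmentor():
    # Keep only the cheap geometric augmentations: random-resized crop + flip.
    return [
        GoogleNetResize(),
        imgaug.Flip(horiz=True),
    ]
```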
@@ -30,7 +30,7 @@ GAN_FACTOR_PARAMETER = 2.
 def normalize(v):
     assert isinstance(v, tf.Tensor)
     v.get_shape().assert_has_rank(4)
-    return v / tf.reduce_mean(v, axis=[1, 2, 3], keep_dims=True)
+    return v / tf.reduce_mean(v, axis=[1, 2, 3], keepdims=True)


 def gram_matrix(v):
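The `keep_dims` → `keepdims` rename follows TensorFlow's argument deprecation; older TF releases that only know the old spelling reject the new keyword with a `TypeError`. If both versions need to be supported, a small compatibility shim (an illustrative sketch, not part of this commit) could look like:

```python
import tensorflow as tf

def reduce_mean_keepdims(x, axis):
    """Call tf.reduce_mean keeping the reduced dims, on both old and new TF."""
    try:
        return tf.reduce_mean(x, axis=axis, keepdims=True)   # TF >= 1.5
    except TypeError:
        return tf.reduce_mean(x, axis=axis, keep_dims=True)  # older TF
```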
@@ -336,7 +336,13 @@ class StatMonitorParamSetter(HyperParamSetter):
         self.last_changed_epoch = 0

     def _get_value_to_set(self):
-        hist = self.trainer.monitors.get_history(self.stat_name)
+        try:
+            hist = self.trainer.monitors.get_history(self.stat_name)
+        except KeyError:
+            logger.warn(
+                "[StatMonitorParamSetter] Key {} not found in monitor history! Ignore it.".format(self.stat_name))
+            return None
         if len(hist) < self.last_k + 1 or \
                 self.epoch_num - self.last_changed_epoch < self.last_k:
             return None
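With the new guard, a misspelled or never-logged statistic name now only produces a warning instead of crashing the callback. A hedged usage sketch (the statistic name and decay factor are assumptions for illustration):

```python
from tensorpack.callbacks import StatMonitorParamSetter

# Multiply the learning rate by 0.2 whenever 'val-error' has not improved
# over the last 5 epochs. If 'val-error' is never logged, the callback now
# warns and does nothing instead of raising KeyError.
lr_decay = StatMonitorParamSetter(
    'learning_rate',                    # hyperparameter to adjust
    'val-error',                        # monitored statistic (assumed name)
    lambda lr: lr * 0.2,
    threshold=0, last_k=5)
```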
@@ -81,24 +81,36 @@ class CollectionGuard(object):
     def __init__(self, name, check_diff,
                  freeze_keys=[],
-                 diff_whitelist=[
-                     tf.GraphKeys.TRAINABLE_VARIABLES,
-                     tf.GraphKeys.GLOBAL_VARIABLES,
-                     tf.GraphKeys.QUEUE_RUNNERS,
-                     tf.GraphKeys.LOCAL_VARIABLES]):
+                 diff_whitelist=None):
         """
         Args:
             name (str): name of the tower
-            check_diff (bool): whether to test and print about collection change
+            check_diff (bool): whether to check and print about collection change
+                when leaving this guard.
             freeze_keys (list): list of keys to freeze
-            diff_whitelist (list): list of keys to not print, when check_diff is True
+            diff_whitelist (list): list of keys to ignore, when check_diff is True.
+                Defaults to some collections that are normally changed,
+                including variables, losses, contexts, queue runners.
         """
         self._name = name
         self._check_diff = check_diff
+        if diff_whitelist is None:
+            diff_whitelist = CollectionGuard._default_diff_whitelist()
         self._whitelist = set(diff_whitelist)
         self._freeze_keys = freeze_keys
         self._inverse_graphkeys = get_inverse_graphkeys()

+    @staticmethod
+    def _default_diff_whitelist():
+        ret = [tf.GraphKeys.TRAINABLE_VARIABLES,
+               tf.GraphKeys.GLOBAL_VARIABLES,
+               tf.GraphKeys.QUEUE_RUNNERS,
+               tf.GraphKeys.LOCAL_VARIABLES]
+        for newkey in ['COND_CONTEXT', 'WHILE_CONTEXT', 'LOSSES']:
+            if hasattr(tf.GraphKeys, newkey):
+                ret.append(getattr(tf.GraphKeys, newkey))
+        return ret
+
     def _key_name(self, name):
         return self._inverse_graphkeys.get(name, name)
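The `hasattr` loop is there because `COND_CONTEXT`, `WHILE_CONTEXT` and `LOSSES` only exist as `tf.GraphKeys` members in newer TF releases. For context, a hedged sketch of how the guard is typically driven (mirroring what `TowerContext` does internally; `build_graph_fn` is a placeholder, not a real function in the repo):

```python
import tensorflow as tf

# CollectionGuard is the class shown in the diff above.
guard = CollectionGuard(
    'tower-pred', check_diff=True,
    freeze_keys=[tf.GraphKeys.UPDATE_OPS])  # collections to restore on exit

with guard:              # used as a context manager
    build_graph_fn()     # placeholder for code that adds ops/collections
# On exit, changes to non-whitelisted collections are reported, and the
# frozen keys are restored to their previous contents.
```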
@@ -8,7 +8,7 @@ from six.moves import zip
 from ..utils import logger
 from ..utils.argtools import call_only_once
-from ..utils.naming import TRAIN_TOWER_FREEZE_KEYS, PREDICT_TOWER_FREEZE_KEYS
+from ..utils.naming import MOVING_SUMMARY_OPS_KEY
 from ..utils.develop import HIDE_DOC
 from .collection import CollectionGuard
 from .common import get_tf_version_number, get_op_or_tensor_by_name, get_op_tensor_name
@@ -122,8 +122,9 @@ class TowerContext(object):
         if self.is_main_training_tower:
             return []
         if self.is_training:
-            return TRAIN_TOWER_FREEZE_KEYS
-        return PREDICT_TOWER_FREEZE_KEYS
+            return [tf.GraphKeys.SUMMARIES, MOVING_SUMMARY_OPS_KEY]
+        # freeze UPDATE_OPS during inference because they should never be used
+        return [tf.GraphKeys.SUMMARIES, MOVING_SUMMARY_OPS_KEY, tf.GraphKeys.UPDATE_OPS]

     def __enter__(self):
         global _CurrentTowerContext
@@ -2,17 +2,7 @@
 # File: naming.py

-import tensorflow as tf
-
 GLOBAL_STEP_INCR_OP_NAME = 'global_step_incr'
-GLOBAL_STEP_INCR_VAR_NAME = 'global_step_incr:0'

 # extra variables to summarize during training in a moving-average way
 MOVING_SUMMARY_OPS_KEY = 'MOVING_SUMMARY_OPS'

-SUMMARY_BACKUP_KEYS = [tf.GraphKeys.SUMMARIES, MOVING_SUMMARY_OPS_KEY]
-TRAIN_TOWER_FREEZE_KEYS = SUMMARY_BACKUP_KEYS
-PREDICT_TOWER_FREEZE_KEYS = SUMMARY_BACKUP_KEYS + [tf.GraphKeys.UPDATE_OPS]
-# also freeze UPDATE_OPS in inference, because they should never be used
-
-# TODO a better way to log and warn about collection change during build_graph.