Commit c667b1de authored by Yuxin Wu's avatar Yuxin Wu

CheckNumerics Callback

parent ac9ac2a4
...@@ -42,13 +42,13 @@ See `./alexnet.py --help` for usage. ...@@ -42,13 +42,13 @@ See `./alexnet.py --help` for usage.
### VGG16 ### VGG16
This VGG16 script, when trained with 8 GPUs and 32 batch size per GPU, reaches the following This VGG16 script, when trained with 8 GPUs and 32 batch size per GPU, reaches the following
validation error after 100 epochs (30h with 8 P100s). This is the code for the VGG validation error after 100 epochs (30h with 8 P100s). This reproduces the VGG
experiments in the paper [Group Normalization](https://arxiv.org/abs/1803.08494). experiments in the paper [Group Normalization](https://arxiv.org/abs/1803.08494).
See `./vgg16.py --help` for usage. See `./vgg16.py --help` for usage.
| No Normalization | Batch Normalization | Group Normalization | | No Normalization | Batch Normalization | Group Normalization |
|:------------------------------------------|---------------------|--------------------:| |:------------------------------------------|:-------------------:|:-------------------:|
| 29~30% (large variation with random seed) | 28% | 27.6% | | 29~30% (large variation with random seed) | 28% | 27.6% |
Note that the purpose of this experiment in the paper is not to claim GroupNorm Note that the purpose of this experiment in the paper is not to claim GroupNorm
has better performance than BatchNorm. has better performance than BatchNorm.
......
...@@ -14,7 +14,7 @@ from ..utils import logger ...@@ -14,7 +14,7 @@ from ..utils import logger
from .base import Callback from .base import Callback
__all__ = ['RunOp', 'RunUpdateOps', 'ProcessTensors', 'DumpTensors', __all__ = ['RunOp', 'RunUpdateOps', 'ProcessTensors', 'DumpTensors',
'DumpTensor', 'DumpTensorAsImage', 'DumpParamAsImage'] 'DumpTensor', 'DumpTensorAsImage', 'DumpParamAsImage', 'CheckNumerics']
class RunOp(Callback): class RunOp(Callback):
...@@ -213,6 +213,20 @@ class DumpTensorAsImage(Callback): ...@@ -213,6 +213,20 @@ class DumpTensorAsImage(Callback):
cv2.imwrite(fname, res.astype('uint8')) cv2.imwrite(fname, res.astype('uint8'))
class CheckNumerics(Callback):
    """
    Callback that verifies the trainable variables contain no NaN or Inf.

    Each time it is triggered, every variable returned by
    ``tf.trainable_variables()`` is passed through ``tf.check_numerics``.
    An exception is raised if such a value is found.
    """

    def _setup_graph(self):
        # Build one check op per trainable variable; the error message
        # embeds the variable's op name so the offender can be identified.
        checks = []
        for var in tf.trainable_variables():
            message = "CheckNumerics['{}']".format(var.op.name)
            checks.append(tf.check_numerics(var, message).op)
        # Bundle the individual checks so one run covers every variable.
        self._check_op = tf.group(*checks)

    def _trigger(self):
        self._check_op.run()
try: try:
import cv2 import cv2
except ImportError: except ImportError:
......
...@@ -167,4 +167,7 @@ def Dropout(x, *args, **kwargs): ...@@ -167,4 +167,7 @@ def Dropout(x, *args, **kwargs):
if kwargs.get('training', None) is None: if kwargs.get('training', None) is None:
kwargs['training'] = get_current_tower_context().is_training kwargs['training'] = get_current_tower_context().is_training
return tf.layers.dropout(x, rate=rate, **kwargs) if get_tf_version_tuple() <= (1, 12):
return tf.layers.dropout(x, rate=rate, **kwargs)
else:
return tf.nn.dropout(x, rate=rate if kwargs['training'] else 0.)
...@@ -341,14 +341,14 @@ class HorovodTrainer(SingleCostTrainer): ...@@ -341,14 +341,14 @@ class HorovodTrainer(SingleCostTrainer):
+ Make sure your InputSource has reasonable randomness. + Make sure your InputSource has reasonable randomness.
+ If your data processing is heavy, doing it in a separate dedicated process might be + If your data processing is heavy, doing it in a single dedicated process might be
a better choice than doing it repeatedly in each process. a better choice than doing it repeatedly in each process.
+ You need to make sure log directories in each process won't conflict. + You need to make sure log directories in each process won't conflict.
You can set it only for the chief process, or set a different one for each process. You can set it only for the chief process, or set a different one for each process.
+ Callbacks have an option to be run only in the chief process, or in all processes. + Callbacks have an option to be run only in the chief process, or in all processes.
See :meth:`callback.set_chief_only()`. Most callbacks have a reasonable See :meth:`Callback.set_chief_only()`. Most callbacks have a reasonable
default already, but certain callbacks may not behave properly by default. Report an issue if you find any. default already, but certain callbacks may not behave properly by default. Report an issue if you find any.
+ You can use Horovod API such as `hvd.rank()` to know which process you are and choose + You can use Horovod API such as `hvd.rank()` to know which process you are and choose
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment