Commit c667b1de authored by Yuxin Wu

CheckNumerics Callback

parent ac9ac2a4
......@@ -42,13 +42,13 @@ See `./alexnet.py --help` for usage.
 ### VGG16
 This VGG16 script, when trained with 8 GPUs and 32 batch size per GPU, reaches the following
-validation error after 100 epochs (30h with 8 P100s). This is the code for the VGG
+validation error after 100 epochs (30h with 8 P100s). This reproduces the VGG
 experiments in the paper [Group Normalization](https://arxiv.org/abs/1803.08494).
 See `./vgg16.py --help` for usage.

 | No Normalization | Batch Normalization | Group Normalization |
-|:------------------------------------------|---------------------|--------------------:|
-| 29~30% (large variation with random seed) | 28% | 27.6% |
+|:------------------------------------------|:-------------------:|:-------------------:|
+| 29~30% (large variation with random seed) | 28%                 | 27.6%               |

 Note that the purpose of this experiment in the paper is not to claim GroupNorm
 has better performance than BatchNorm.
......
......@@ -14,7 +14,7 @@ from ..utils import logger
 from .base import Callback

 __all__ = ['RunOp', 'RunUpdateOps', 'ProcessTensors', 'DumpTensors',
-           'DumpTensor', 'DumpTensorAsImage', 'DumpParamAsImage']
+           'DumpTensor', 'DumpTensorAsImage', 'DumpParamAsImage', 'CheckNumerics']


 class RunOp(Callback):
......@@ -213,6 +213,20 @@ class DumpTensorAsImage(Callback):
         cv2.imwrite(fname, res.astype('uint8'))
+
+
+class CheckNumerics(Callback):
+    """
+    When triggered, check variables in the graph for NaN and Inf.
+    Raise an exception if such an error is found.
+    """
+    def _setup_graph(self):
+        vars = tf.trainable_variables()
+        ops = [tf.check_numerics(v, "CheckNumerics['{}']".format(v.op.name)).op for v in vars]
+        self._check_op = tf.group(*ops)
+
+    def _trigger(self):
+        self._check_op.run()

 try:
     import cv2
 except ImportError:
......
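For context (not part of the diff), here is a minimal sketch of how the new callback might be wired into a tensorpack training setup. `MyModel` and `my_dataflow` are hypothetical placeholders; the rest uses standard tensorpack APIs:

```python
# Hypothetical usage sketch: run the NaN/Inf check on all trainable
# variables at every trigger (by default, once per epoch).
from tensorpack import TrainConfig, SimpleTrainer, launch_train_with_config
from tensorpack.callbacks import CheckNumerics

config = TrainConfig(
    model=MyModel(),          # hypothetical ModelDesc subclass
    dataflow=my_dataflow,     # hypothetical DataFlow
    callbacks=[CheckNumerics()],
    max_epoch=100,
)
launch_train_with_config(config, SimpleTrainer())
```

Because `tf.check_numerics` raises as soon as a checked tensor contains NaN or Inf, this surfaces diverging training runs at the epoch boundary instead of letting them silently continue.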
......@@ -167,4 +167,7 @@ def Dropout(x, *args, **kwargs):
     if kwargs.get('training', None) is None:
         kwargs['training'] = get_current_tower_context().is_training
-    return tf.layers.dropout(x, rate=rate, **kwargs)
+    if get_tf_version_tuple() <= (1, 12):
+        return tf.layers.dropout(x, rate=rate, **kwargs)
+    else:
+        return tf.nn.dropout(x, rate=rate if kwargs['training'] else 0.)
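The version gate exists because `tf.nn.dropout` only accepts a `rate=` argument on newer TF releases, while `tf.layers.dropout` is deprecated there. A hedged sketch of what the new branch amounts to at call time (assuming TF ≥ 1.13 semantics):

```python
# Illustrative only: on newer TF, inference is emulated by passing
# rate=0 (dropout disabled) instead of relying on a `training` flag.
import tensorflow as tf

x = tf.random.uniform([4, 8])
train_out = tf.nn.dropout(x, rate=0.5)   # training: drop half the units
infer_out = tf.nn.dropout(x, rate=0.)    # inference: identity
```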
......@@ -341,14 +341,14 @@ class HorovodTrainer(SingleCostTrainer):
 + Make sure your InputSource has reasonable randomness.
-+ If your data processing is heavy, doing it in a separate dedicated process might be
++ If your data processing is heavy, doing it in a single dedicated process might be
   a better choice than doing it repeatedly in each process.
 + You need to make sure log directories in each process won't conflict.
   You can set it only for the chief process, or set a different one for each process.
 + Callbacks have an option to be run only in the chief process, or in all processes.
-  See :meth:`callback.set_chief_only()`. Most callbacks have a reasonable
+  See :meth:`Callback.set_chief_only()`. Most callbacks have a reasonable
   default already, but certain callbacks may not behave properly by default. Report an issue if you find any.
 + You can use Horovod API such as `hvd.rank()` to know which process you are and choose
......
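As an aside (not part of this commit), a minimal sketch of the per-process log-directory advice from the docstring above, assuming Horovod and tensorpack's `logger` module; the directory names are arbitrary:

```python
# Sketch: give each Horovod process its own log directory so they
# don't conflict; only the chief (rank 0) keeps the main directory.
import horovod.tensorflow as hvd
from tensorpack.utils import logger

hvd.init()
if hvd.rank() == 0:
    logger.set_logger_dir('train_log/chief')
else:
    logger.set_logger_dir('train_log/worker-{}'.format(hvd.rank()))
```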