Don't use too many prediction_incorrect

9672e503 · Yuxin Wu · 86c9df35 · 9672e503 · 9672e503 · 9672e503
Commit 9672e503 authored Oct 31, 2017 by Yuxin Wu
6 changed files
--- a/docs/tutorial/performance-tuning.md
+++ b/docs/tutorial/performance-tuning.md
@@ -10,7 +10,7 @@ Here's a list of things you can do when your training is slow:
 2. If you use queue-based input + dataflow, you can look for the queue size statistics in
 	 training log. Ideally the queue should be near-full (default size is 50).
 	 If the size is near-zero, data is the bottleneck.
-3. If the GPU utilization is low, it may be because of slow data, or some ops are on CPU. Also make sure GPUs are not locked in P8 state.
+3. If the GPU utilization is low, it may be because of slow data, or some ops are inefficient. Also make sure GPUs are not locked in P8 state.

 ## Benchmark the components
 1. Use `DummyConstantInput(shapes)` as the `InputSource`.
@@ -67,7 +67,10 @@ But there may be something cheap you can try:
 ### Cannot scale to multi-GPU
 If you're unable to scale to multiple GPUs almost linearly:
 1. First make sure that the ResNet example can scale. Run it with `--fake` to use fake data.
-2. Then note that your model may have a different communication-computation pattern.
+	If not, it's a bug or an environment setup problem.
+2. Then note that your model may have a different communication-computation pattern or other
+	 characteristics that affect efficiency.
+	 There isn't a simple answer to this.
 	 Switching to a different multi-GPU trainer can sometimes affect the speed significantly.

 Note that scalability measurement always trains with the same "batch size per GPU", not the same total equivalent batch size.
--- a/examples/cifar-convnet.py
+++ b/examples/cifar-convnet.py
@@ -67,9 +67,9 @@ class Model(ModelDesc):
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

-        wrong = symbf.prediction_incorrect(logits, label)
+        correct = tf.to_float(tf.nn.in_top_k(logits, label, 1), name='correct')
        # monitor training error
-        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))
+        add_moving_summary(tf.reduce_mean(correct, name='accuracy'))

        # weight decay on all W of fc layers
        wd_cost = regularize_cost('fc.*/W', l2_regularizer(4e-4), name='regularize_loss')
@@ -127,7 +127,8 @@ def get_config(cifar_classnum):
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
-            InferenceRunner(dataset_test, ClassificationError()),
+            InferenceRunner(dataset_test,
+                            ScalarStats(['accuracy', 'cost'])),
            StatMonitorParamSetter('learning_rate', 'val_error', lr_func,
                                   threshold=0.001, last_k=10),
        ],

--- a/examples/mnist-convnet.py
+++ b/examples/mnist-convnet.py
@@ -63,7 +63,6 @@ class Model(ModelDesc):
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')  # the average cross-entropy loss

-        # compute the "correct vector", for the callback ClassificationError to use at validation time
        correct = tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32, name='correct')
        accuracy = tf.reduce_mean(correct, name='accuracy')

@@ -118,9 +117,7 @@ def get_config():
            MaxSaver('validation_accuracy'),  # save the model with highest accuracy (prefix 'validation_')
            InferenceRunner(    # run inference(for validation) after every epoch
                dataset_test,   # the DataFlow instance used for validation
-                # Calculate both the cost and the accuracy for this DataFlow
-                [ScalarStats('cross_entropy_loss'),
-                 ClassificationError('correct', 'validation_accuracy')]),
+                ScalarStats(['cross_entropy_loss', 'accuracy'])),
        ],
        steps_per_epoch=steps_per_epoch,
        max_epoch=100,

--- a/examples/mnist-keras.py
+++ b/examples/mnist-keras.py
@@ -61,9 +61,9 @@ class Model(ModelDesc):
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')  # the average cross-entropy loss

        # for tensorpack validation
-        wrong = symbolic_functions.prediction_incorrect(logits, label, name='incorrect')
-        train_error = tf.reduce_mean(wrong, name='train_error')
-        summary.add_moving_summary(train_error)
+        acc = tf.to_float(tf.nn.in_top_k(logits, label, 1))
+        acc = tf.reduce_mean(acc, name='accuracy')
+        summary.add_moving_summary(acc)

        wd_cost = tf.add_n(M.losses, name='regularize_loss')    # this is how Keras manage regularizers
        self.cost = tf.add_n([wd_cost, cost], name='total_cost')
@@ -97,7 +97,7 @@ if __name__ == '__main__':
            ModelSaver(),
            InferenceRunner(
                dataset_test,
-                [ScalarStats('cross_entropy_loss'), ClassificationError('incorrect')]),
+                ScalarStats(['cross_entropy_loss', 'accuracy'])),
        ],
        max_epoch=100,
    )

--- a/examples/mnist-tfslim.py
+++ b/examples/mnist-tfslim.py
@@ -53,10 +53,10 @@ class Model(ModelDesc):
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')

-        wrong = symbolic_functions.prediction_incorrect(logits, label, name='incorrect')
+        acc = tf.to_float(tf.nn.in_top_k(logits, label, 1))

-        train_error = tf.reduce_mean(wrong, name='train_error')
-        summary.add_moving_summary(train_error)
+        acc = tf.reduce_mean(acc, name='accuracy')
+        summary.add_moving_summary(acc)

        self.cost = cost
        summary.add_moving_summary(cost)
@@ -88,7 +88,7 @@ def get_config():
            ModelSaver(),
            InferenceRunner(
                dataset_test,
-                [ScalarStats('cross_entropy_loss'), ClassificationError('incorrect')]),
+                ScalarStats(['cross_entropy_loss', 'accuracy'])),
        ],
        max_epoch=100,
    )

--- a/tensorpack/callbacks/inference.py
+++ b/tensorpack/callbacks/inference.py
@@ -139,8 +139,9 @@ class ClassificationError(Inferencer):
    whether each sample in the batch is *incorrectly* classified.
    You can use ``tf.nn.in_top_k`` to produce this vector.

-    This Inferencer produces the "true" error,
-    taking account of the fact that batches might not have the same size in
+    This Inferencer produces the "true" error, which could be different from
+    `ScalarStats('error_rate')`.
+    It takes account of the fact that batches might not have the same size in
    testing (because the size of test set might not be a multiple of batch size).
    Therefore the result can be different from averaging the error rate of each batch.

@@ -152,8 +153,6 @@ class ClassificationError(Inferencer):
        """
        Args:
            wrong_tensor_name(str): name of the ``wrong`` tensor.
-                The default is the same as the default output name of
-                :meth:`prediction_incorrect`.
            summary_name(str): the name to log the error with.
        """
        self.wrong_tensor_name = wrong_tensor_name