Commit 740e9d8c authored by Yuxin Wu

[Incompatible] refactor inferencer. ClassificationError now takes a vector

parent 148d7dd9
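This is the incompatible part of the change: `ClassificationError` no longer reads a scalar `nr_wrong` tensor but a 0/1 "incorrect vector" (one entry per sample), so the callback can count both the failed samples and the batch size by itself. A minimal before/after sketch of the migration pattern that the example diffs below apply everywhere (`logits` and `label` come from the surrounding model graph; names follow the diffs):

```python
# Before: ClassificationError('wrong:0') expected a scalar counting
# the failed samples in the batch:
wrong = prediction_incorrect(logits, label)      # 0/1 float vector of length B
nr_wrong = tf.reduce_sum(wrong, name='wrong')    # scalar consumed by the callback

# After: name the 0/1 vector itself and drop the reduce_sum;
# ClassificationError('incorrect_vector') consumes the vector directly
# (a scalar still works, but triggers a one-time deprecation warning):
wrong = prediction_incorrect(logits, label, name='incorrect_vector')
```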
@@ -29,7 +29,7 @@ Describe your training task with three components:
 + Use Python to easily handle any of your own data format, yet still keep a good training speed thanks to multiprocess prefetch & TF Queue prefetch.
   For example, InceptionV3 can run in the same speed as the official code which reads data using TF operators.
-3. Callbacks, including everything you want to do apart from the training iterations. Such as:
+3. Callbacks, including everything you want to do apart from the training iterations, such as:
 + Change hyperparameters during training
 + Print some variables of interest
 + Run inference on a test dataset
@@ -49,7 +49,7 @@ Multi-GPU training is off-the-shelf by simply switching the trainer.
   pip install --user -r requirements.txt
   pip install --user -r opt-requirements.txt (some optional dependencies, you can install later if needed)
   ```
-+ Use [tcmalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html) whenever possible
++ [tcmalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html) usually helps.
 + Enable `import tensorpack`:
   ```
   export PYTHONPATH=$PYTHONPATH:`readlink -f path/to/tensorpack`
...
@@ -40,7 +40,6 @@ class Model(mnist_example.Model):
         prob = tf.nn.softmax(logits, name='prob')

         wrong = symbolic_functions.prediction_incorrect(logits, label)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong')
         add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

         cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
...
@@ -142,11 +142,9 @@ class Model(ModelDesc):
         cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')

-        wrong = prediction_incorrect(logits, label, 1)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong-top1')
+        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
         add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
-        wrong = prediction_incorrect(logits, label, 5)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong-top5')
+        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
         add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

         # weight decay on all W of fc layers
...
@@ -109,7 +109,6 @@ class Model(ModelDesc):
         # compute the number of failed samples
         wrong = prediction_incorrect(logits, label)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong')
         # monitor training error
         add_moving_summary(tf.reduce_mean(wrong, name='train_error'))
...
@@ -102,12 +102,10 @@ class Model(ModelDesc):
         cost = tf.add_n([loss3, 0.3 * loss2, 0.3 * loss1], name='weighted_cost')
         add_moving_summary([cost, loss1, loss2, loss3])

-        wrong = prediction_incorrect(logits, label, 1)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong-top1')
+        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
         add_moving_summary(tf.reduce_mean(wrong, name='train_error_top1'))
-        wrong = prediction_incorrect(logits, label, 5)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong-top5')
+        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
         add_moving_summary(tf.reduce_mean(wrong, name='train_error_top5'))

         # weight decay on all W of fc layers
...
@@ -180,12 +180,10 @@ class Model(ModelDesc):
         loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
         loss2 = tf.reduce_mean(loss2, name='loss2')

-        wrong = prediction_incorrect(logits, label, 1)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong-top1')
+        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
         add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
-        wrong = prediction_incorrect(logits, label, 5)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong-top5')
+        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
         add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

         # weight decay on all W of fc layers
...
@@ -101,7 +101,6 @@ class Model(ModelDesc):
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')

         wrong = prediction_incorrect(logits, label)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong')
         # monitor training error
         add_moving_summary(tf.reduce_mean(wrong, name='train_error'))
...
@@ -110,12 +110,10 @@ class Model(ModelDesc):
         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
         loss = tf.reduce_mean(loss, name='xentropy-loss')

-        wrong = prediction_incorrect(logits, label, 1)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong-top1')
+        wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
         add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
-        wrong = prediction_incorrect(logits, label, 5)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong-top5')
+        wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
         add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

         # weight decay on all W of fc layers
...
@@ -76,7 +76,6 @@ class Model(ModelDesc):
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')

         wrong = symbolic_functions.prediction_incorrect(logits, label)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong')
         summary.add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

         wd_cost = tf.mul(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),
...
@@ -59,9 +59,7 @@ class Model(ModelDesc):
         cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')

-        # compute the number of failed samples, for ClassificationError to use at test time
         wrong = symbf.prediction_incorrect(logits, label)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong')
         # monitor training error
         add_moving_summary(tf.reduce_mean(wrong, name='train_error'))
...
@@ -62,9 +62,8 @@ class Model(ModelDesc):
         cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)  # a vector of length B with loss of each sample
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')  # the average cross-entropy loss

-        # compute the number of failed samples, for the callback ClassificationError to use at test time
-        wrong = symbolic_functions.prediction_incorrect(logits, label)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong')
+        # compute the "incorrect vector", for the callback ClassificationError to use at validation time
+        wrong = symbolic_functions.prediction_incorrect(logits, label, name='incorrect')

         # This will monitor training error (in a moving_average fashion):
         # 1. write the value to tensorboard
@@ -117,7 +116,7 @@ def get_config():
             InferenceRunner(    # run inference (for validation) after every epoch
                 dataset_test,   # the DataFlow instance used for validation
                 # Calculate both the cost and the error for this DataFlow
-                [ScalarStats('cost'), ClassificationError() ]),
+                [ScalarStats('cost'), ClassificationError('incorrect')]),
         ]),
         model=Model(),
         step_per_epoch=step_per_epoch,
...
@@ -46,7 +46,6 @@ class Model(ModelDesc):
         # compute the number of failed samples, for ClassificationError to use at test time
         wrong = prediction_incorrect(logits, label)
-        nr_wrong = tf.reduce_sum(wrong, name='wrong')
         # monitor training error
         add_moving_summary(tf.reduce_mean(wrong, name='train_error'))
...
@@ -11,10 +11,11 @@ import six
 from six.moves import zip, map

 from ..dataflow import DataFlow
-from ..utils import get_tqdm_kwargs, logger
+from ..utils import get_tqdm_kwargs, logger, execute_only_once
 from ..utils.stat import RatioCounter, BinaryStatistics
 from ..tfutils import get_op_tensor_name, get_op_var_name
 from .base import Callback
+from .dispatcher import OutputTensorDispatcer

 __all__ = ['InferenceRunner', 'ClassificationError',
            'ScalarStats', 'Inferencer', 'BinaryClassificationStats']
@@ -31,14 +32,14 @@ class Inferencer(object):
     def _before_inference(self):
         pass

-    def datapoint(self, dp, output):
+    def datapoint(self, _, output):
         """
         Called after complete running every data point
         """
-        self._datapoint(dp, output)
+        self._datapoint(_, output)

     @abstractmethod
-    def _datapoint(self, dp, output):
+    def _datapoint(self, _, output):
         pass

     def after_inference(self):
@@ -97,21 +98,24 @@ class InferenceRunner(Callback):
         self.input_tensors = [x.name for x in input_vars]

     def _find_output_tensors(self):
+        dispatcer = OutputTensorDispatcer()
+        for inf in self.infs:
+            dispatcer.add_entry(inf.get_output_tensors())
+        all_names = dispatcer.get_all_names()
         IOTensor = InferenceRunner.IOTensor
-        self.output_tensors = []
-        def find_oid(t):
-            tensorname = get_op_tensor_name(t)[1]
-            if tensorname in self.input_tensors:
-                # this inferencer needs the input dp
-                return IOTensor(self.input_tensors.index(tensorname), False)
-            if t in self.output_tensors:
-                return IOTensor(self.output_tensors.index(t), True)
-            else:
-                self.output_tensors.append(t)
-                return IOTensor(len(self.output_tensors) - 1, True)
-        self.inf_to_tensors = [
-            [find_oid(t) for t in inf.get_output_tensors()]
-            for inf in self.infs]
+        self.output_tensors = list(filter(
+            lambda x: x not in self.input_tensors, all_names))
+        def find_oid(idxs):
+            ret = []
+            for idx in idxs:
+                name = all_names[idx]
+                if name in self.input_tensors:
+                    ret.append(IOTensor(self.input_tensors.index(name), False))
+                else:
+                    ret.append(IOTensor(self.output_tensors.index(name), True))
+            return ret
+        self.inf_to_tensors = [find_oid(t) for t in dispatcer.get_idx_for_each_entry()]
         # list of list of (var_name: IOTensor)

     def _trigger_epoch(self):
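The new `OutputTensorDispatcer` (imported from `callbacks/dispatcher.py`, which is not shown in this diff) deduplicates the output tensors requested by all inferencers, so a tensor shared by several inferencers is fetched only once per step. Judging purely from the call sites above, its contract looks roughly like this hypothetical sketch:

```python
# Hypothetical sketch inferred from the call sites in _find_output_tensors;
# the real implementation lives in tensorpack/callbacks/dispatcher.py.
class OutputTensorDispatcerSketch(object):
    def __init__(self):
        self._names = []   # all tensor names, deduplicated, in first-seen order
        self._idxs = []    # for each entry, indices into self._names

    def add_entry(self, names):
        # register one inferencer's requested tensor names
        idxs = []
        for name in names:
            if name not in self._names:
                self._names.append(name)
            idxs.append(self._names.index(name))
        self._idxs.append(idxs)

    def get_all_names(self):
        return list(self._names)

    def get_idx_for_each_entry(self):
        # one list of indices per add_entry() call, in the same order
        return list(self._idxs)
```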
@@ -162,7 +166,7 @@ class ScalarStats(Inferencer):
     def _before_inference(self):
         self.stats = []

-    def _datapoint(self, dp, output):
+    def _datapoint(self, _, output):
         self.stats.append(output)

     def _after_inference(self):
@@ -180,15 +184,16 @@ class ClassificationError(Inferencer):
     """
     Compute classification error in batch mode, from a `wrong` variable.
-    The `wrong` variable is supposed to be an integer equal to the number of failed samples in this batch.
-    You can use `tf.nn.in_top_k` to record top-k error as well.
+    The `wrong` tensor is supposed to be a 0/1 integer vector containing
+    whether each sample in the batch is incorrectly classified.
+    You can use `tf.nn.in_top_k` to produce this vector, and record top-k error as well.

     This callback produces the "true" error,
     taking account of the fact that batches might not have the same size in
     testing (because the size of test set might not be a multiple of batch size).
     Therefore the result is different from averaging the error rate of each batch.
     """
-    def __init__(self, wrong_var_name='wrong:0', summary_name='val_error'):
+    def __init__(self, wrong_var_name='incorrect_vector', summary_name='val_error'):
         """
         :param wrong_var_name: name of the `wrong` variable
         :param summary_name: the name for logging
@@ -202,9 +207,18 @@ class ClassificationError(Inferencer):
     def _before_inference(self):
         self.err_stat = RatioCounter()

-    def _datapoint(self, dp, outputs):
-        batch_size = dp[0].shape[0]   # assume batched input
-        wrong = int(outputs[0])
+    def _datapoint(self, _, outputs):
+        vec = outputs[0]
+        if vec.ndim == 0:
+            if execute_only_once():
+                logger.warn("[DEPRECATED] use a 'wrong vector' for ClassificationError instead of nr_wrong")
+            batch_size = _[0].shape[0]   # assume batched input
+            wrong = int(vec)
+        else:
+            # TODO put shape assertion into InferenceRunner
+            assert vec.ndim == 1, "{} is not a vector!".format(self.wrong_var_name)
+            batch_size = len(vec)
+            wrong = np.sum(vec)
         self.err_stat.feed(wrong, batch_size)

     def _after_inference(self):
@@ -230,7 +244,7 @@ class BinaryClassificationStats(Inferencer):
     def _before_inference(self):
         self.stat = BinaryStatistics()

-    def _datapoint(self, dp, outputs):
+    def _datapoint(self, _, outputs):
         pred, label = outputs
         self.stat.feed(pred, label)
...
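To make the "true error" point in the docstring above concrete, here is a small made-up example of why the callback accumulates counts with `RatioCounter` instead of averaging per-batch error rates:

```python
import numpy as np

# Made-up run: 250 test samples evaluated with batch size 100,
# so the last batch only contains 50 samples.
batch_sizes = [100, 100, 50]
wrong_counts = [10, 10, 25]

# Averaging per-batch error rates over-weights the small final batch:
naive = np.mean([w / float(b) for w, b in zip(wrong_counts, batch_sizes)])
print(naive)        # (0.10 + 0.10 + 0.50) / 3 = 0.2333...

# RatioCounter-style accumulation yields the true error over all samples:
true_error = sum(wrong_counts) / float(sum(batch_sizes))
print(true_error)   # 45 / 250 = 0.18
```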
@@ -6,13 +6,14 @@ import tensorflow as tf
 import numpy as np
 from ..utils import logger

-def prediction_incorrect(logits, label, topk=1):
+def prediction_incorrect(logits, label, topk=1, name='incorrect_vector'):
     """
     :param logits: NxC
     :param label: N
     :returns: a float32 vector of length N with 0/1 values, 1 meaning incorrect prediction
     """
-    return tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, topk)), tf.float32)
+    return tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, topk)),
+                   tf.float32, name=name)

 def flatten(x):
     """
...
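With the new `name` argument on `prediction_incorrect`, tracking top-k validation error only takes naming the vector and pointing a `ClassificationError` at it, as the ImageNet-style examples earlier in this commit do. A usage sketch (the summary names are illustrative):

```python
# in _build_graph(): produce and name the 0/1 top-5 incorrect vector
wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))

# in get_config(): let the inferencer consume that vector at validation time
callbacks = [
    InferenceRunner(dataset_test,
                    [ClassificationError('wrong-top5', summary_name='val-error-top5')]),
]
```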
@@ -143,7 +143,7 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer):
             async_step_total_cnt = int(re.findall(
                 '[0-9]+', self.async_step_counter.__str__())[0])
             self.write_scalar_summary(
-                'async-global-step', async_step_total_cnt)
+                'async_global_step', async_step_total_cnt)
         except:
-            logger.exception("Cannot log async-global-step")
+            logger.exception("Cannot log async_global_step")
         super(AsyncMultiGPUTrainer, self)._trigger_epoch()
@@ -115,7 +115,6 @@ class EnqueueThread(threading.Thread):
     def run(self):
         self.dataflow.reset_state()
         with self.sess.as_default():
             try:
                 while True:
...
@@ -94,6 +94,12 @@ def get_rng(obj=None):

 _EXECUTE_HISTORY = set()
 def execute_only_once():
+    """
+    When called with:
+        if execute_only_once():
+            # do something
+    the body is guaranteed to be executed only the first time.
+    """
     f = inspect.currentframe().f_back
     ident = (f.f_code.co_filename, f.f_lineno)
     if ident in _EXECUTE_HISTORY:
...
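`execute_only_once` keys on the caller's file name and line number, so each call site fires at most once per process, which is exactly what the deprecation warning in `ClassificationError._datapoint` above relies on. A minimal usage sketch (the function name here is illustrative):

```python
from tensorpack.utils import logger, execute_only_once

def load_legacy_checkpoint(path):
    if execute_only_once():
        # printed the first time this line runs; silent on later calls
        logger.warn("load_legacy_checkpoint() is deprecated")
    # ... actual loading would go here ...
```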