Commit c68686e6 authored by Yuxin Wu's avatar Yuxin Wu

use uint8 in resnet example

parent 1a5d3f4f
......@@ -13,9 +13,11 @@ Most of them are the best reproducible results on gym.
It should run at a speed of 6~10 iteration/s on 1 GPU plus 12+ CPU cores.
Training with a significantly slower speed (e.g. on CPU) will result in a very bad score,
probably because of async issues.
The pre-trained models are all trained with 4 GPUs for about 2 days.
But note that multi-GPU doesn't give you obvious speedup here,
because the bottleneck is not computation but data. On machines without huge memory, you may also need to
enable tcmalloc to keep training throughput more stable.
Occasionally, processes may not get terminated completely, therefore it is suggested to use `systemd-run` to run any
multiprocess Python program to get a cgroup dedicated for the task.
......
......@@ -22,10 +22,9 @@ See "Rethinking the Inception Architecture for Computer Vision", arxiv:1512.0056
This config follows the official inceptionv3 setup
(https://github.com/tensorflow/models/tree/master/inception/inception)
with much much fewer lines of code.
It reaches 74% single-crop validation accuracy, similar to the official code.
The hyperparameters here are for 8 GPUs, so the effective batch size is 8*64 = 512.
With 8 TitanX it runs about 0.45 it/s.
"""
TOTAL_BATCH_SIZE = 512
......
......@@ -29,11 +29,16 @@ DEPTH = None
class Model(ModelDesc):
def _get_inputs(self):
    """Declare the model's input tensors.

    Images are fed as uint8 NHWC tensors; the cast to float and the
    per-channel mean/std normalization happen inside the graph (in
    _build_graph), which keeps the input pipeline cheap and halves
    the data-transfer size compared to feeding float32.

    Returns:
        list[InputVar]: the image input and the int32 class-label input.
    """
    return [InputVar(tf.uint8, [None, INPUT_SHAPE, INPUT_SHAPE, 3], 'input'),
            InputVar(tf.int32, [None], 'label')]
def _build_graph(self, inputs):
image, label = inputs
image = tf.cast(image, tf.float32) * (1.0 / 255)
image_mean = tf.constant([0.485, 0.456, 0.406], dtype=tf.float32)
image_std = tf.constant([0.229, 0.224, 0.225], dtype=tf.float32)
image = (image - image_mean) / image_std
def shortcut(l, n_in, n_out, stride):
if n_in != n_out:
......@@ -121,9 +126,6 @@ def get_data(train_or_test):
datadir = args.data
ds = dataset.ILSVRC12(datadir, train_or_test,
shuffle=True if isTrain else False, dir_structure='original')
image_mean = np.array([0.485, 0.456, 0.406], dtype='float32')
image_std = np.array([0.229, 0.224, 0.225], dtype='float32')
if isTrain:
class Resize(imgaug.ImageAugmentor):
"""
......@@ -164,18 +166,18 @@ def get_data(train_or_test):
)]),
imgaug.Clip(),
imgaug.Flip(horiz=True),
imgaug.MapImage(lambda x: (x * (1.0 / 255) - image_mean) / image_std),
imgaug.ToUint8()
]
else:
augmentors = [
imgaug.ResizeShortestEdge(256),
imgaug.CenterCrop((224, 224)),
imgaug.MapImage(lambda x: (x * (1.0 / 255) - image_mean) / image_std),
imgaug.ToUint8()
]
ds = AugmentImageComponent(ds, augmentors)
ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
if isTrain:
ds = PrefetchDataZMQ(ds, min(20, multiprocessing.cpu_count()))
ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
return ds
......
......@@ -38,7 +38,9 @@ class TestDataSpeed(ProxyDataFlow):
Start testing with a progress bar.
"""
self.ds.reset_state()
with get_tqdm(total=self.test_size, leave=True) as pbar:
# add smoothing for speed benchmark
with get_tqdm(total=self.test_size,
leave=True, smoothing=0.2) as pbar:
for idx, dp in enumerate(self.ds.get_data()):
pbar.update()
if idx == self.test_size - 1:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment