Commit c68686e6 authored by Yuxin Wu

use uint8 in resnet example

parent 1a5d3f4f
@@ -13,9 +13,11 @@ Most of them are the best reproducible results on gym.
 It should run at a speed of 6~10 iteration/s on 1 GPU plus 12+ CPU cores.
 Training at a significantly slower speed (e.g. on CPU) will result in a very bad score,
 probably because of async issues.
 The pre-trained models are all trained with 4 GPUs for about 2 days.
 But note that multi-GPU doesn't give you an obvious speedup here,
-because the bottleneck is not computation but data.
+because the bottleneck is not computation but data. On machines without huge memory, you may also need to
+enable tcmalloc to keep training throughput more stable.
 
 Occasionally, processes may not get terminated completely; it is therefore suggested to use `systemd-run` to run any
 multiprocess Python program, to get a cgroup dedicated to the task.
@@ -22,10 +22,9 @@ See "Rethinking the Inception Architecture for Computer Vision", arxiv:1512.00567
 This config follows the official inceptionv3 setup
 (https://github.com/tensorflow/models/tree/master/inception/inception)
 with much fewer lines of code.
-It reaches 74% single-crop validation accuracy,
-and has the same running speed as the official code.
+It reaches 74% single-crop validation accuracy, similar to the official code.
 The hyperparameters here are for 8 GPUs, so the effective batch size is 8*64 = 512.
-With 8 TitanX it runs about 0.45 it/s.
 """
 TOTAL_BATCH_SIZE = 512
@@ -29,11 +29,16 @@ DEPTH = None
 class Model(ModelDesc):
     def _get_inputs(self):
-        return [InputVar(tf.float32, [None, INPUT_SHAPE, INPUT_SHAPE, 3], 'input'),
+        return [InputVar(tf.uint8, [None, INPUT_SHAPE, INPUT_SHAPE, 3], 'input'),
                 InputVar(tf.int32, [None], 'label')]
 
     def _build_graph(self, inputs):
         image, label = inputs
+        image = tf.cast(image, tf.float32) * (1.0 / 255)
+        image_mean = tf.constant([0.485, 0.456, 0.406], dtype=tf.float32)
+        image_std = tf.constant([0.229, 0.224, 0.225], dtype=tf.float32)
+        image = (image - image_mean) / image_std
+
         def shortcut(l, n_in, n_out, stride):
             if n_in != n_out:
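The point of the change above: images now stay uint8 through the entire input pipeline, a quarter of the bytes of float32 for every copy between processes and onto the GPU, while the cast to float and the per-channel ImageNet mean/std normalization happen once inside the TensorFlow graph. Below is a minimal standalone sketch of the same in-graph normalization, assuming the TF 1.x session API of this era; shapes and constants are copied from the diff.

    import numpy as np
    import tensorflow as tf

    # The placeholder receives raw uint8 images, as in the diff above.
    image_u8 = tf.placeholder(tf.uint8, [None, 224, 224, 3], name='input')

    # Cast to float and normalize with the standard ImageNet
    # per-channel mean/std; broadcasting applies them channel-wise.
    image = tf.cast(image_u8, tf.float32) * (1.0 / 255)
    mean = tf.constant([0.485, 0.456, 0.406], dtype=tf.float32)
    std = tf.constant([0.229, 0.224, 0.225], dtype=tf.float32)
    image = (image - mean) / std

    with tf.Session() as sess:
        batch = np.random.randint(0, 256, size=(4, 224, 224, 3), dtype=np.uint8)
        out = sess.run(image, feed_dict={image_u8: batch})
        print(out.dtype, out.shape)  # float32 (4, 224, 224, 3)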
@@ -121,9 +126,6 @@ def get_data(train_or_test):
     datadir = args.data
     ds = dataset.ILSVRC12(datadir, train_or_test,
                           shuffle=True if isTrain else False, dir_structure='original')
-    image_mean = np.array([0.485, 0.456, 0.406], dtype='float32')
-    image_std = np.array([0.229, 0.224, 0.225], dtype='float32')
-
     if isTrain:
         class Resize(imgaug.ImageAugmentor):
             """
@@ -164,18 +166,18 @@ def get_data(train_or_test):
             )]),
             imgaug.Clip(),
             imgaug.Flip(horiz=True),
-            imgaug.MapImage(lambda x: (x * (1.0 / 255) - image_mean) / image_std),
+            imgaug.ToUint8()
         ]
     else:
         augmentors = [
             imgaug.ResizeShortestEdge(256),
             imgaug.CenterCrop((224, 224)),
-            imgaug.MapImage(lambda x: (x * (1.0 / 255) - image_mean) / image_std),
+            imgaug.ToUint8()
         ]
     ds = AugmentImageComponent(ds, augmentors)
-    ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
     if isTrain:
         ds = PrefetchDataZMQ(ds, min(20, multiprocessing.cpu_count()))
+    ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
     return ds
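With `imgaug.ToUint8()` ending the augmentor list and `BatchData` moved after `PrefetchDataZMQ`, each ZMQ message carries a single uint8 image rather than part of a float32 batch. A quick back-of-envelope in plain numpy of what keeping uint8 saves, using 512 as an illustrative total batch size (the figure from the inception config earlier in this commit):

    import numpy as np

    # One augmented 224x224x3 training image, in both dtypes.
    img_uint8 = np.zeros((224, 224, 3), dtype=np.uint8)
    img_float32 = img_uint8.astype(np.float32)

    print(img_uint8.nbytes)    # 150528 bytes  (~147 KB)
    print(img_float32.nbytes)  # 602112 bytes  (~588 KB)

    # Per 512-image batch, the uint8 pipeline moves ~220 MiB less
    # through the ZMQ pipe and into the training process.
    saving = (img_float32.nbytes - img_uint8.nbytes) * 512
    print(saving / 2**20)  # ~220.5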
@@ -38,7 +38,9 @@ class TestDataSpeed(ProxyDataFlow):
         Start testing with a progress bar.
         """
         self.ds.reset_state()
-        with get_tqdm(total=self.test_size, leave=True) as pbar:
+        # add smoothing for speed benchmark
+        with get_tqdm(total=self.test_size,
+                      leave=True, smoothing=0.2) as pbar:
             for idx, dp in enumerate(self.ds.get_data()):
                 pbar.update()
                 if idx == self.test_size - 1:
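tqdm's `smoothing` argument sets the weight of the exponential moving average behind the displayed rate: 0 averages over the whole run, 1 shows only the most recent item's speed (the library default is 0.3). Lowering it to 0.2 steadies the it/s readout when per-item latency is uneven, which suits a benchmark. A small self-contained illustration; the sleep pattern is made up:

    import time
    from tqdm import tqdm

    # smoothing=0.2 damps the rate estimate, so occasional slow
    # items (every 10th here) wobble the displayed it/s less.
    with tqdm(total=100, leave=True, smoothing=0.2) as pbar:
        for i in range(100):
            time.sleep(0.2 if i % 10 == 0 else 0.05)  # simulated uneven latency
            pbar.update()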