Commit 0ba89131 authored by Yuxin Wu's avatar Yuxin Wu

Move ImageNet models together

parent e9a4df1b
../ResNet/imagenet_utils.py
../ImageNetModels/imagenet_utils.py
ImageNet training code for ResNet, Inception, VGG, ShuffleNet and DoReFa-Net, with tensorpack.
To train any of the models, just run `./{model}.py --data /path/to/ilsvrc`.
The expected format of the data directory is described in the [docs](http://tensorpack.readthedocs.io/en/latest/modules/dataflow.dataset.html#tensorpack.dataflow.dataset.ILSVRC12).
Pretrained models can be downloaded from the [tensorpack model zoo](http://models.tensorpack.com/).
### ShuffleNet
Reproduce [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083)
on ImageNet.
This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in [version 2](https://arxiv.org/pdf/1707.01083v2) of the paper.
After 240 epochs (36 hours on 8 P100s) it reaches a top-1 error of 42.32%, better than the paper's number.
To print flops:
```bash
./shufflenet.py --flops
```
It will print about 75Mflops (roughly twice the 38M above), because the paper counts one multiply-add as a single flop while TensorFlow counts it as two.
Evaluate the [pretrained model](http://models.tensorpack.com/ShuffleNet/):
```bash
./shufflenet.py --eval --data /path/to/ilsvrc --load /path/to/model
```
### Inception-BN, VGG16
The Inception-BN script reaches 27% single-crop validation error after 300k steps with 6 GPUs.
The VGG16 script reaches 28.8% single-crop validation error after 100 epochs.
### ResNet, DoReFa-Net
See [ResNet examples](../ResNet) and [DoReFa-Net examples](../DoReFa-Net).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: imagenet_utils.py
import cv2
import numpy as np
import multiprocessing
import tensorflow as tf
from abc import abstractmethod
from tensorpack import imgaug, dataset, ModelDesc, InputDesc
from tensorpack.dataflow import (
AugmentImageComponent, PrefetchDataZMQ,
BatchData, MultiThreadMapData)
from tensorpack.predict import PredictConfig, SimpleDatasetPredictor
from tensorpack.utils.stats import RatioCounter
from tensorpack.models import regularize_cost
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.utils import logger
class GoogleNetResize(imgaug.ImageAugmentor):
"""
crop 8%~100% of the original image
See `Going Deeper with Convolutions` by Google.
"""
def __init__(self, crop_area_fraction=0.08,
aspect_ratio_low=0.75, aspect_ratio_high=1.333,
target_shape=224):
self._init(locals())
def _augment(self, img, _):
h, w = img.shape[:2]
area = h * w
for _ in range(10):
targetArea = self.rng.uniform(self.crop_area_fraction, 1.0) * area
aspectR = self.rng.uniform(self.aspect_ratio_low, self.aspect_ratio_high)
ww = int(np.sqrt(targetArea * aspectR) + 0.5)
hh = int(np.sqrt(targetArea / aspectR) + 0.5)
if self.rng.uniform() < 0.5:
ww, hh = hh, ww
if hh <= h and ww <= w:
x1 = 0 if w == ww else self.rng.randint(0, w - ww)
y1 = 0 if h == hh else self.rng.randint(0, h - hh)
out = img[y1:y1 + hh, x1:x1 + ww]
out = cv2.resize(out, (self.target_shape, self.target_shape), interpolation=cv2.INTER_CUBIC)
return out
out = imgaug.ResizeShortestEdge(self.target_shape, interp=cv2.INTER_CUBIC).augment(img)
out = imgaug.CenterCrop(self.target_shape).augment(out)
return out
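def _example_googlenet_resize(path='example.jpg'):
    # Minimal usage sketch of GoogleNetResize; ``path`` is a placeholder, not a
    # file shipped with tensorpack. Returns a random 8%~100%-area crop of the
    # input, resized to 224x224x3.
    aug = GoogleNetResize(target_shape=224)
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    return aug.augment(img)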
def fbresnet_augmentor(isTrain):
"""
    Augmentors used in fb.resnet.torch, for BGR images in range [0, 255].
"""
if isTrain:
augmentors = [
GoogleNetResize(),
imgaug.RandomOrderAug( # Remove these augs if your CPU is not fast enough
[imgaug.BrightnessScale((0.6, 1.4), clip=False),
imgaug.Contrast((0.6, 1.4), clip=False),
imgaug.Saturation(0.4, rgb=False),
# rgb-bgr conversion for the constants copied from fb.resnet.torch
imgaug.Lighting(0.1,
eigval=np.asarray(
[0.2175, 0.0188, 0.0045][::-1]) * 255.0,
eigvec=np.array(
[[-0.5675, 0.7192, 0.4009],
[-0.5808, -0.0045, -0.8140],
[-0.5836, -0.6948, 0.4203]],
dtype='float32')[::-1, ::-1]
)]),
imgaug.Flip(horiz=True),
]
else:
augmentors = [
imgaug.ResizeShortestEdge(256, cv2.INTER_CUBIC),
imgaug.CenterCrop((224, 224)),
]
return augmentors
def get_imagenet_dataflow(
datadir, name, batch_size,
augmentors, parallel=None):
"""
See explanations in the tutorial:
http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
"""
assert name in ['train', 'val', 'test']
assert datadir is not None
assert isinstance(augmentors, list)
isTrain = name == 'train'
if parallel is None:
parallel = min(40, multiprocessing.cpu_count() // 2) # assuming hyperthreading
if isTrain:
ds = dataset.ILSVRC12(datadir, name, shuffle=True)
ds = AugmentImageComponent(ds, augmentors, copy=False)
if parallel < 16:
logger.warn("DataFlow may become the bottleneck when too few processes are used.")
ds = PrefetchDataZMQ(ds, parallel)
ds = BatchData(ds, batch_size, remainder=False)
else:
ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
aug = imgaug.AugmentorList(augmentors)
def mapf(dp):
fname, cls = dp
im = cv2.imread(fname, cv2.IMREAD_COLOR)
im = aug.augment(im)
return im, cls
ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
ds = BatchData(ds, batch_size, remainder=True)
ds = PrefetchDataZMQ(ds, 1)
return ds
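def _example_dataflow(datadir='/path/to/ilsvrc'):
    # Sketch: build the validation dataflow and pull one batch for inspection.
    # ``datadir`` is a placeholder assumption.
    df = get_imagenet_dataflow(datadir, 'val', 32, fbresnet_augmentor(False))
    df.reset_state()
    for images, labels in df.get_data():
        print(images.shape, labels.shape)  # (32, 224, 224, 3) (32,)
        break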
def eval_on_ILSVRC12(model, sessinit, dataflow):
pred_config = PredictConfig(
model=model,
session_init=sessinit,
input_names=['input', 'label'],
output_names=['wrong-top1', 'wrong-top5']
)
pred = SimpleDatasetPredictor(pred_config, dataflow)
acc1, acc5 = RatioCounter(), RatioCounter()
for top1, top5 in pred.get_result():
batch_size = top1.shape[0]
acc1.feed(top1.sum(), batch_size)
acc5.feed(top5.sum(), batch_size)
print("Top1 Error: {}".format(acc1.ratio))
print("Top5 Error: {}".format(acc5.ratio))
class ImageNetModel(ModelDesc):
weight_decay = 1e-4
image_shape = 224
"""
uint8 instead of float32 is used as input type to reduce copy overhead.
It might hurt the performance a liiiitle bit.
The pretrained models were trained with float32.
"""
image_dtype = tf.uint8
def __init__(self, data_format='NCHW'):
self.data_format = data_format
def _get_inputs(self):
return [InputDesc(self.image_dtype, [None, self.image_shape, self.image_shape, 3], 'input'),
InputDesc(tf.int32, [None], 'label')]
def _build_graph(self, inputs):
image, label = inputs
image = ImageNetModel.image_preprocess(image, bgr=True)
if self.data_format == 'NCHW':
image = tf.transpose(image, [0, 3, 1, 2])
logits = self.get_logits(image)
loss = ImageNetModel.compute_loss_and_error(logits, label)
if self.weight_decay > 0:
wd_loss = regularize_cost('.*/W', tf.contrib.layers.l2_regularizer(self.weight_decay),
name='l2_regularize_loss')
add_moving_summary(loss, wd_loss)
self.cost = tf.add_n([loss, wd_loss], name='cost')
else:
self.cost = tf.identity(loss, name='cost')
add_moving_summary(self.cost)
@abstractmethod
def get_logits(self, image):
"""
Args:
image: 4D tensor of 224x224 in ``self.data_format``
Returns:
Nx1000 logits
"""
def _get_optimizer(self):
lr = tf.get_variable('learning_rate', initializer=0.1, trainable=False)
tf.summary.scalar('learning_rate-summary', lr)
return tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True)
@staticmethod
def image_preprocess(image, bgr=True):
with tf.name_scope('image_preprocess'):
if image.dtype.base_dtype != tf.float32:
image = tf.cast(image, tf.float32)
image = image * (1.0 / 255)
mean = [0.485, 0.456, 0.406] # rgb
std = [0.229, 0.224, 0.225]
if bgr:
mean = mean[::-1]
std = std[::-1]
image_mean = tf.constant(mean, dtype=tf.float32)
image_std = tf.constant(std, dtype=tf.float32)
image = (image - image_mean) / image_std
return image
@staticmethod
def compute_loss_and_error(logits, label):
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
loss = tf.reduce_mean(loss, name='xentropy-loss')
def prediction_incorrect(logits, label, topk=1, name='incorrect_vector'):
with tf.name_scope('prediction_incorrect'):
x = tf.logical_not(tf.nn.in_top_k(logits, label, topk))
return tf.cast(x, tf.float32, name=name)
wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
wrong = prediction_incorrect(logits, label, 5, name='wrong-top5')
add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))
return loss
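class _ExampleNet(ImageNetModel):
    """
    Minimal sketch of subclassing ImageNetModel: only ``get_logits`` must be
    implemented. The layers below are arbitrary placeholders, not a real
    architecture, and assume the model is built with data_format='NHWC'.
    """
    def get_logits(self, image):
        from tensorpack.models import Conv2D, GlobalAvgPooling, FullyConnected
        l = Conv2D('conv0', image, 64, 7, stride=2, nl=tf.nn.relu)
        l = GlobalAvgPooling('gap', l)
        return FullyConnected('linear', l, 1000, nl=tf.identity)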
if __name__ == '__main__':
import argparse
from tensorpack.dataflow import TestDataSpeed
parser = argparse.ArgumentParser()
parser.add_argument('--data', required=True)
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--aug', choices=['train', 'val'], default='val')
args = parser.parse_args()
if args.aug == 'val':
augs = fbresnet_augmentor(False)
elif args.aug == 'train':
augs = fbresnet_augmentor(True)
df = get_imagenet_dataflow(
args.data, 'train', args.batch, augs)
    # With the val augmentor, this should reach >100 it/s (i.e. ~3k im/s) on a decent E5 server.
TestDataSpeed(df).start()
@@ -22,14 +22,6 @@ NR_GPU = 6
BATCH_SIZE = TOTAL_BATCH_SIZE // NR_GPU
INPUT_SHAPE = 224
"""
Inception-BN model on ILSVRC12.
See "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift", arxiv:1502.03167
This config reaches 73% single-crop validation accuracy after 300k steps with 6 GPUs.
"""
class Model(ModelDesc):
def _get_inputs(self):
return [InputDesc(tf.float32, [None, INPUT_SHAPE, INPUT_SHAPE, 3], 'input'),
......
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# File: vgg16.py
import argparse
import os
import tensorflow as tf
from tensorpack import *
from tensorpack.models import *
from tensorpack.callbacks import *
from tensorpack.train import TrainConfig, SyncMultiGPUTrainerReplicated
from tensorpack.dataflow import imgaug
from tensorpack.tfutils import argscope, get_model_loader, get_current_tower_context
from tensorpack.tfutils.summary import *
from tensorpack.utils.gpu import get_nr_gpu
from imagenet_utils import (
ImageNetModel, get_imagenet_dataflow, fbresnet_augmentor)
def convnormrelu(x, name, chan):
    # 3x3 Conv, optional BatchNorm (controlled by the global ``args.norm``), then ReLU.
    x = Conv2D(name, x, chan, 3)
    if args.norm == 'bn':
        x = BatchNorm(name + '_bn', x)
    x = tf.nn.relu(x, name=name + '_relu')
    return x
class Model(ImageNetModel):
weight_decay = 5e-4
def get_logits(self, image):
with argscope(Conv2D, kernel_shape=3,
W_init=tf.variance_scaling_initializer(scale=2.)), \
argscope([Conv2D, MaxPooling, BatchNorm], data_format='NCHW'):
logits = (LinearWrap(image)
.apply(convnormrelu, 'conv1_1', 64)
.apply(convnormrelu, 'conv1_2', 64)
.MaxPooling('pool1', 2)
# 112
.apply(convnormrelu, 'conv2_1', 128)
.apply(convnormrelu, 'conv2_2', 128)
.MaxPooling('pool2', 2)
# 56
.apply(convnormrelu, 'conv3_1', 256)
.apply(convnormrelu, 'conv3_2', 256)
.apply(convnormrelu, 'conv3_3', 256)
.MaxPooling('pool3', 2)
# 28
.apply(convnormrelu, 'conv4_1', 512)
.apply(convnormrelu, 'conv4_2', 512)
.apply(convnormrelu, 'conv4_3', 512)
.MaxPooling('pool4', 2)
# 14
.apply(convnormrelu, 'conv5_1', 512)
.apply(convnormrelu, 'conv5_2', 512)
.apply(convnormrelu, 'conv5_3', 512)
.MaxPooling('pool5', 2)
# 7
.FullyConnected('fc6', 4096,
W_init=tf.random_normal_initializer(stddev=0.001))
.tf.nn.relu(name='fc6_relu')
.Dropout('drop0', rate=0.5)
.FullyConnected('fc7', 4096,
W_init=tf.random_normal_initializer(stddev=0.001))
.tf.nn.relu(name='fc7_relu')
.Dropout('drop1', rate=0.5)
.FullyConnected('fc8', 1000,
W_init=tf.random_normal_initializer(stddev=0.01))())
add_param_summary(('.*', ['histogram', 'rms']))
return logits
def get_data(name, batch):
isTrain = name == 'train'
global args
augmentors = fbresnet_augmentor(isTrain)
return get_imagenet_dataflow(args.data, name, batch, augmentors)
def get_config():
nr_tower = max(get_nr_gpu(), 1)
batch = 64
total_batch = batch * nr_tower
BASE_LR = 0.01 * (total_batch / 256.)
logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
dataset_train = get_data('train', batch)
dataset_val = get_data('val', batch)
infs = [ClassificationError('wrong-top1', 'val-error-top1'),
ClassificationError('wrong-top5', 'val-error-top5')]
callbacks = [
ModelSaver(),
GPUUtilizationTracker(),
EstimatedTimeLeft(),
ScheduledHyperParamSetter('learning_rate',
[(0, 0.01), (3, max(BASE_LR, 0.01))], interp='linear'),
ScheduledHyperParamSetter('learning_rate',
[(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
DataParallelInferenceRunner(
dataset_val, infs, list(range(nr_tower))),
]
input = QueueInput(dataset_train)
input = StagingInput(input, nr_stage=1)
return TrainConfig(
model=Model(),
data=input,
callbacks=callbacks,
steps_per_epoch=1281167 // total_batch,
max_epoch=100,
)
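# Worked example of the schedule above with 8 GPUs (an assumption):
# total_batch = 64 * 8 = 512, so BASE_LR = 0.01 * 512 / 256 = 0.02.
# During epochs 0-3 the LR warms up linearly from 0.01 to 0.02, then decays
# stepwise: 0.002 at epoch 30, 2e-4 at epoch 60, 2e-5 at epoch 80.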
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
parser.add_argument('--data', help='ILSVRC dataset dir')
parser.add_argument('--norm', choices=['none', 'bn'], default='none')
parser.add_argument('--load', help='load model')
args = parser.parse_args()
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
logger.set_logger_dir(os.path.join('train_log', 'vgg16'))
config = get_config()
if args.load:
config.session_init = get_model_loader(args.load)
nr_tower = max(get_nr_gpu(), 1)
trainer = SyncMultiGPUTrainerReplicated(nr_tower)
launch_train_with_config(config, trainer)
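# Typical invocations (paths are placeholders):
#   ./vgg16.py --data /path/to/ilsvrc --gpu 0,1,2,3
#   ./vgg16.py --data /path/to/ilsvrc --norm bn --load /path/to/checkpoint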
Moved to [../ImageNetModels](../ImageNetModels/).