imagenet resnet

bcf8dbfe · Yuxin Wu · e309e627 · bcf8dbfe · bcf8dbfe · bcf8dbfe
Commit bcf8dbfe authored Oct 29, 2016 by Yuxin Wu
4 changed files
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ See some [examples](examples) to learn about the framework.
 You can actually train them and reproduce the performance... not just to see how to write code.
 + [DoReFa-Net: training binary / low bitwidth CNN](examples/DoReFa-Net)
-+ [IncpetionV3 on ImageNet](examples/Inception/inceptionv3.py)
+ [InceptionV3 on ImageNet](examples/Inception/inceptionv3.py)
 + [ResNet for Cifar10 classification](examples/ResNet)
 + [Fully-convolutional Network for Holistically-Nested Edge Detection](examples/HED)
 + [Spatial Transformer Networks on MNIST addition](examples/SpatialTransformer)

--- a/examples/ResNet/imagenet-resnet.py
+++ b/examples/ResNet/imagenet-resnet.py
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+# File: imagenet-resnet.py
+# Author: Yuxin Wu <ppwwyyxx@gmail.com>
+import cv2
+import argparse
+import numpy as np
+import os
+import multiprocessing
+import tensorflow as tf
+from tensorflow.contrib.layers import variance_scaling_initializer
+from tensorpack import *
+from tensorpack.tfutils.symbolic_functions import *
+from tensorpack.tfutils.summary import *
+"""
+Training code of ResNet on ImageNet. Work In Progress.
+Top1 error is now about 0.5% higher than fb.resnet.torch.
+"""
+NR_GPU = 4
+TOTAL_BATCH_SIZE = 256
+BATCH_SIZE = TOTAL_BATCH_SIZE / NR_GPU
+INPUT_SHAPE = 224
+class Model(ModelDesc):
+    def _get_input_vars(self):
+        return [InputVar(tf.float32, [None, INPUT_SHAPE, INPUT_SHAPE, 3], 'input'),
+                InputVar(tf.int32, [None], 'label') ]
+    def _build_graph(self, input_vars):
+        image, label = input_vars
+        def shortcut(l, n_in, n_out, stride):
+            if n_in != n_out:
+                return Conv2D('convshortcut', l, n_out, 1, stride=stride)
+            else:
+                return l
+        def basicblock(l, ch_out, stride, preact):
+            ch_in = l.get_shape().as_list()[-1]
+            input = l
+            if preact == 'both_preact':
+                l = BatchNorm('preact', l)
+                l = tf.nn.relu(l, name='preact-relu')
+                input = l
+            elif preact != 'no_preact':
+                l = BatchNorm('preact', l)
+                l = tf.nn.relu(l, name='preact-relu')
+            l = Conv2D('conv1', l, ch_out, 3, stride=stride)
+            l = BatchNorm('bn', l)
+            l = tf.nn.relu(l)
+            l = Conv2D('conv2', l, ch_out, 3)
+            return l + shortcut(input, ch_in, ch_out, stride)
+        def bottleneck(l, ch_out, stride, preact):
+            ch_in = l.get_shape().as_list()[-1]
+            input = l
+            if preact == 'both_preact':
+                l = BatchNorm('preact', l)
+                l = tf.nn.relu(l, name='preact-relu')
+                input = l
+            elif preact != 'no_preact':
+                l = BatchNorm('preact', l)
+                l = tf.nn.relu(l, name='preact-relu')
+            l = Conv2D('conv1', l, ch_out, 1)
+            l = BatchNorm('bn1', l)
+            l = tf.nn.relu(l)
+            l = Conv2D('conv2', l, ch_out, 3, stride=stride)
+            l = BatchNorm('bn2', l)
+            l = tf.nn.relu(l)
+            l = Conv2D('conv3', l, ch_out * 4, 1)
+            return l + shortcut(input, ch_in, ch_out * 4, stride)
+        def layer(l, layername, block_func, features, count, stride, first=False):
+            with tf.variable_scope(layername):
+                with tf.variable_scope('block0'):
+                    l = block_func(l, features, stride,
+                            'no_preact' if first else 'both_preact')
+                for i in range(1, count):
+                    with tf.variable_scope('block{}'.format(i)):
+                        l = block_func(l, features, 1, 'default')
+                return l
+        cfg = {
+            18: ([2,2,2,2], basicblock),
+            34: ([3,4,6,3], basicblock),
+            50: ([3,4,6,3], bottleneck),
+            101: ([3,4,23,3], bottleneck)
+        }
+        defs, block_func = cfg[50]
+        with argscope(Conv2D, nl=tf.identity, use_bias=False,
+                W_init=variance_scaling_initializer(mode='FAN_OUT')):
+            logits = (LinearWrap(image)
+                .Conv2D('conv0', 64, 7, stride=2, nl=BNReLU)
+                .MaxPooling('pool0', shape=3, stride=2, padding='SAME')
+                .apply(layer, 'group0', block_func, 64, defs[0], 1, first=True)
+                .apply(layer, 'group1', block_func, 128, defs[1], 2)
+                .apply(layer, 'group2', block_func, 256, defs[2], 2)
+                .apply(layer, 'group3', block_func, 512, defs[3], 2)
+                .BatchNorm('bnlast')
+                .tf.nn.relu()
+                .GlobalAvgPooling('gap')
+                .FullyConnected('linear', 1000, nl=tf.identity)())
+        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
+        loss = tf.reduce_mean(loss, name='xentropy-loss')
+        wrong = prediction_incorrect(logits, label, 1)
+        nr_wrong = tf.reduce_sum(wrong, name='wrong-top1')
+        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top1'))
+        wrong = prediction_incorrect(logits, label, 5)
+        nr_wrong = tf.reduce_sum(wrong, name='wrong-top5')
+        add_moving_summary(tf.reduce_mean(wrong, name='train-error-top5'))
+        # weight decay on all W of fc layers
+        wd_w = tf.train.exponential_decay(1e-4, get_global_step_var(),
+                                          200000, 0.7, True)
+        wd_w = wd_w / tf.get_default_graph().get_tensor_by_name('learning_rate')
+        wd_cost = tf.mul(wd_w, regularize_cost('.*/W', tf.nn.l2_loss), name='l2_regularize_loss')
+        add_moving_summary(loss, wd_cost)
+        self.cost = tf.add_n([loss, wd_cost], name='cost')
+def get_data(train_or_test):
+    isTrain = train_or_test == 'train'
+    datadir = args.data
+    ds = dataset.ILSVRC12(datadir, train_or_test,
+            shuffle=True if isTrain else False, dir_structure='original')
+    image_mean = np.array([0.485, 0.456, 0.406], dtype='float32')
+    image_std = np.array([0.229, 0.224, 0.225], dtype='float32')
+    if isTrain:
+        class Resize(imgaug.ImageAugmentor):
+            def __init__(self):
+                self._init(locals())
+            def _augment(self, img, _):
+                # fbaug
+                h, w = img.shape[:2]
+                area = h * w
+                for _ in range(10):
+                    targetArea = self.rng.uniform(0.08, 1.0) * area
+                    aspectR = self.rng.uniform(0.75,1.333)
+                    ww = int(np.sqrt(targetArea * aspectR))
+                    hh = int(np.sqrt(targetArea / aspectR))
+                    if self.rng.uniform() < 0.5:
+                        ww, hh = hh, ww
+                    if hh <= h and ww <= w:
+                        x1 = 0 if w == ww else self.rng.randint(0, w - ww)
+                        y1 = 0 if h == hh else self.rng.randint(0, h - hh)
+                        out = img[y1:y1+hh,x1:x1+ww]
+                        out = cv2.resize(out, (224,224), interpolation=cv2.INTER_CUBIC)
+                        return out
+                out = cv2.resize(img, (224,224), interpolation=cv2.INTER_CUBIC)
+                return out
+        augmentors = [
+            Resize(),
+            imgaug.RandomOrderAug(
+                [imgaug.Brightness(30, clip=False),
+                 imgaug.Gamma(),
+                 imgaug.Contrast((0.8, 1.2), clip=False),
+                 imgaug.Saturation(0.4)]),
+            imgaug.Clip(),
+            imgaug.Flip(horiz=True),
+            imgaug.MapImage(lambda x: (x * (1.0 / 255) - image_mean) / image_std),
+        ]
+    else:
+        def resize_func(im):
+            h, w = im.shape[:2]
+            scale = 256.0 / min(h, w)
+            desSize = map(int, [scale * w, scale * h])
+            im = cv2.resize(im, tuple(desSize), interpolation=cv2.INTER_CUBIC)
+            return im
+        augmentors = [
+            imgaug.MapImage(resize_func),
+            imgaug.CenterCrop((224, 224)),
+            imgaug.MapImage(lambda x: (x * (1.0 / 255) - image_mean) / image_std),
+        ]
+    ds = AugmentImageComponent(ds, augmentors)
+    ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
+    if isTrain:
+        ds = PrefetchDataZMQ(ds, min(12, multiprocessing.cpu_count()))
+    return ds
+def get_config():
+    # prepare dataset
+    dataset_train = get_data('train')
+    dataset_val = get_data('val')
+    sess_config = get_default_sess_config(0.99)
+    lr = tf.Variable(0.1, trainable=False, name='learning_rate')
+    tf.scalar_summary('learning_rate', lr)
+    return TrainConfig(
+        dataset=dataset_train,
+        optimizer=tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True),
+        callbacks=Callbacks([
+            StatPrinter(), ModelSaver(),
+            InferenceRunner(dataset_val, [
+                ClassificationError('wrong-top1', 'val-error-top1'),
+                ClassificationError('wrong-top5', 'val-error-top5')]),
+            ScheduledHyperParamSetter('learning_rate',
+                                      [(30, 1e-2), (60, 1e-3), (85, 2e-4)]),
+            HumanHyperParamSetter('learning_rate'),
+        ]),
+        session_config=sess_config,
+        model=Model(),
+        step_per_epoch=5000,
+        max_epoch=110,
+    )
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.') # nargs='*' in multi mode
+    parser.add_argument('--data', help='ILSVRC dataset dir')
+    parser.add_argument('--load', help='load model')
+    args = parser.parse_args()
+    logger.auto_set_dir()
+    if args.gpu:
+        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
+    config = get_config()
+    if args.load:
+        config.session_init = SaverRestore(args.load)
+    if args.gpu:
+        config.nr_tower = len(args.gpu.split(','))
+    SyncMultiGPUTrainer(config).train()
--- a/tensorpack/dataflow/dataset/ilsvrc.py
+++ b/tensorpack/dataflow/dataset/ilsvrc.py
@@ -97,12 +97,12 @@ class ILSVRC12(RNGDataFlow):
            original ILSVRC12_`name`.tar gets decompressed.
        :param name: 'train' or 'val' or 'test'
        :param dir_structure: The dir structure of 'val' and 'test'.
-            If is 'original' then keep the original decompressed dir with list
+            If is 'original' then keep the original decompressed directory with list
-            of image files (as below). If equals to 'train', use the `train/` dir
+            of image files (as below). If set to 'train', use the the same
-            structure with class name as subdirectories.
+            directory structure as 'train/', with class name as subdirectories.
        :param include_bb: Include the bounding box. Maybe useful in training.
-        Dir should have the following structure:
+        When `dir_structure=='original'`, `dir` should have the following structure:
        .. code-block:: none
@@ -128,6 +128,7 @@ class ILSVRC12(RNGDataFlow):
        command to build the above structure for `train/`:
        .. code-block:: none
            tar xvf ILSVRC12_img_train.tar -C train && cd train
            find -type f -name '*.tar' | parallel -P 10 'echo {} && mkdir -p {/.} && tar xf {} -C {/.}'
            Or:

--- a/tensorpack/dataflow/dataset/svhn.py
+++ b/tensorpack/dataflow/dataset/svhn.py
@@ -22,10 +22,10 @@ SVHN_URL = "http://ufldl.stanford.edu/housenumbers/"
 class SVHNDigit(RNGDataFlow):
    """
-    SVHN Cropped Digit Dataset
+    SVHN Cropped Digit Dataset.
    return img of 32x32x3, label of 0-9
    """
-    Cache = {}
+    _Cache = {}
    def __init__(self, name, data_dir=None, shuffle=True):
        """
@@ -34,8 +34,8 @@ class SVHNDigit(RNGDataFlow):
        """
        self.shuffle = shuffle
-        if name in SVHNDigit.Cache:
+        if name in SVHNDigit._Cache:
-            self.X, self.Y = SVHNDigit.Cache[name]
+            self.X, self.Y = SVHNDigit._Cache[name]
            return
        if data_dir is None:
            data_dir = get_dataset_path('svhn_data')
@@ -48,7 +48,7 @@ class SVHNDigit(RNGDataFlow):
        self.X = data['X'].transpose(3,0,1,2)
        self.Y = data['y'].reshape((-1))
        self.Y[self.Y==10] = 0
-        SVHNDigit.Cache[name] = (self.X, self.Y)
+        SVHNDigit._Cache[name] = (self.X, self.Y)
    def size(self):
        return self.X.shape[0]