Commit 82036227 authored by Yuxin Wu's avatar Yuxin Wu

update alexnet training script

parent 375123f5
@@ -2,11 +2,10 @@
This is the official script to train or run the pretrained model for the paper:
[DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients](http://arxiv.org/abs/1606.06160), by Zhou et al.
We hosted a demo at CVPR16 on behalf of Megvii, Inc., running real-time DoReFa-Net on both ARM and FPGA.
But we're not planning to release the runtime bit-op library.
Training code for SVHN is available.
The provided pretrained model is an AlexNet with 1-bit weights, 2-bit activations, trained with 4-bit gradients.
A pretrained model for the 1-2-6 configuration will be available shortly.
## Preparation:
@@ -21,47 +20,23 @@
To use the script, you'll need:
```
git clone https://github.com/ppwwyyxx/tensorpack
pip install --user -r tensorpack/requirements.txt
export PYTHONPATH=$PYTHONPATH:`readlink -f tensorpack`
```
+ To perform training, you'll also need [pyzmq](https://github.com/zeromq/pyzmq) and [scipy](https://www.scipy.org/):
```
pip install --user pyzmq scipy
```
+ Pretrained model is hosted at [google drive](https://drive.google.com/open?id=0B308TeQzmFDLa0xOeVQwcXg1ZjQ)
## Load and run the model
We published the model in two file formats:
+ `alexnet.npy`. It's simply a dict of {param name: value}.
You can load it with `np.load('alexnet.npy').item()` for other purposes; a short inspection snippet follows this list.
Run the model with:
```
./alexnet.py --load alexnet.npy [--input img.jpg] [--data path/to/data]
```
+ `alexnet.meta` + `alexnet.tfmodel`. A TensorFlow MetaGraph proto and a saved checkpoint.
```
./alexnet.py --graph alexnet.meta --load alexnet.tfmodel [--input path/to/img.jpg] [--data path/to/ILSVRC12]
```
In both cases, one of `--data` or `--input` must be present, either to run classification on some input images, or to run evaluation on the ILSVRC12 validation set.
To evaluate on ILSVRC12, `path/to/ILSVRC12` must have a subdirectory named `val` containing all the validation images.
+ Look at the docstring in `svhn-digit-dorefa.py` or `alexnet-dorefa.py` to see detailed usage and performance.
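For example, a quick way to inspect the `.npy` parameter dict (a minimal sketch; the exact parameter names depend on the released file):
```
import numpy as np

# load the dict of {param name: value} stored in alexnet.npy
params = np.load('alexnet.npy').item()
for name, value in sorted(params.items()):
    print(name, value.shape)
```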
## Support
Please use [github issues](https://github.com/ppwwyyxx/tensorpack/issues) for any issues related to the code.
Send email to the authors for general questions related to the paper.
## Citation
If you use our code or models in your research, please cite:
```
@article{zhou2016dorefa,
author = {Shuchang Zhou and Yuxin Wu and Zekun Ni and Xinyu Zhou and He Wen and Yuheng Zou},
title = {DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients},
journal = {CoRR},
volume = {abs/1606.06160},
......
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# File: alexnet-dorefa.py
# Author: Yuxin Wu <ppwwyyxx@gmail.com>
import cv2
import tensorflow as tf
import argparse
import numpy as np
import multiprocessing
import msgpack
import os
from tensorpack import *
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.tfutils.summary import *
from dorefa import get_dorefa
"""
This is a tensorpack script for the ImageNet results in paper:
DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients
http://arxiv.org/abs/1606.06160
The original experiments were performed on a proprietary framework.
This is our attempt to reproduce them on tensorpack/tensorflow.
Accuracy:
Trained with 4 GPUs and (W,A,G)=(1,2,6), it can reach a top-1 single-crop validation error of 52%
after 64 epochs. This is slightly better than the number in the paper,
probably due to more sophisticated augmentors.
Note that the effective batch size in SyncMultiGPUTrainer is actually
BATCH_SIZE * NUM_GPU. With a different number of GPUs, results may differ,
and the learning rate in particular may need adjustment.
Speed:
About 3.5 iterations/s on 4 Tesla M40s. (Each epoch is set to 10000 iterations)
To Run:
./alexnet-dorefa.py --dorefa 1,2,6 --data PATH --gpu 0,1,2,3
PATH should look like:
PATH/
train/
n02134418/
n02134418_198.JPEG
...
...
val/
ILSVRC2012_val_00000001.JPEG
...
And it is better to have:
Fast random-access disk (not necessarily SSD. I used a RAID of HDDs, but I'm not sure a plain HDD is enough)
More than 12 CPU cores (for data processing)
"""
BITW = 1
BITA = 2
BITG = 4
BATCH_SIZE = 32
class Model(ModelDesc):
def _get_input_vars(self):
return [InputVar(tf.float32, [None, 224, 224, 3], 'input'),
InputVar(tf.int32, [None], 'label') ]
def _build_graph(self, input_vars, is_training):
image, label = input_vars
image = image / 255.0
fw, fa, fg = get_dorefa(BITW, BITA, BITG)
# monkey-patch tf.get_variable to apply fw
old_get_variable = tf.get_variable
def new_get_variable(name, shape=None, **kwargs):
v = old_get_variable(name, shape, **kwargs)
# don't binarize first and last layer
if name != 'W' or 'conv0' in v.op.name or 'fct' in v.op.name:
return v
else:
logger.info("Binarizing weight {}".format(v.op.name))
return fw(v)
tf.get_variable = new_get_variable
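        # (the patch is in effect only while the layers below are created;
        # tf.get_variable is restored right after the graph is built)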
def nonlin(x):
if BITA == 32:
return tf.nn.relu(x) # still use relu for 32bit cases
return tf.clip_by_value(x, 0.0, 1.0)
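        # quantized activation: clip to [0, 1] first so fa sees a bounded input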
def activate(x):
return fa(nonlin(x))
with argscope(BatchNorm, decay=0.9, epsilon=1e-4, use_local_stat=is_training), \
argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
logits = (LinearWrap(image)
.Conv2D('conv0', 96, 12, stride=4, padding='VALID')
.apply(activate)
.Conv2D('conv1', 256, 5, padding='SAME', split=2)
.apply(fg)
.BatchNorm('bn1')
.MaxPooling('pool1', 3, 2, padding='SAME')
.apply(activate)
.Conv2D('conv2', 384, 3)
.apply(fg)
.BatchNorm('bn2')
.MaxPooling('pool2', 3, 2, padding='SAME')
.apply(activate)
.Conv2D('conv3', 384, 3, split=2)
.apply(fg)
.BatchNorm('bn3')
.apply(activate)
.Conv2D('conv4', 256, 3, split=2)
.apply(fg)
.BatchNorm('bn4')
.MaxPooling('pool4', 3, 2, padding='VALID')
.apply(activate)
.FullyConnected('fc0', 4096)
.apply(fg)
.BatchNorm('bnfc0')
.apply(activate)
.FullyConnected('fc1', 4096)
.apply(fg)
.BatchNorm('bnfc1')
.apply(nonlin)
.FullyConnected('fct', 1000, use_bias=True)())
tf.get_variable = old_get_variable
prob = tf.nn.softmax(logits, name='output')
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, label)
cost = tf.reduce_mean(cost, name='cross_entropy_loss')
wrong = prediction_incorrect(logits, label, 1)
nr_wrong = tf.reduce_sum(wrong, name='wrong-top1')
add_moving_summary(tf.reduce_mean(wrong, name='train_error_top1'))
wrong = prediction_incorrect(logits, label, 5)
nr_wrong = tf.reduce_sum(wrong, name='wrong-top5')
add_moving_summary(tf.reduce_mean(wrong, name='train_error_top5'))
# weight decay on all W of fc layers
wd_cost = regularize_cost('fc.*/W', l2_regularizer(5e-6))
add_moving_summary(cost, wd_cost)
add_param_summary([('.*/W', ['histogram', 'rms'])])
self.cost = tf.add_n([cost, wd_cost], name='cost')
def get_data(dataset_name):
isTrain = dataset_name == 'train'
ds = dataset.ILSVRC12(args.data, dataset_name,
shuffle=True if isTrain else False)
meta = dataset.ILSVRCMeta()
pp_mean = meta.get_per_pixel_mean()
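    # the per-pixel mean image is 256x256; take the central 224x224 crop
    # to match the network input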
pp_mean_224 = pp_mean[16:-16,16:-16,:]
if isTrain:
class Resize(imgaug.ImageAugmentor):
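            # randomly rescale the shorter side into [224, 308), with an extra
            # +/-15% independent stretch per axis; the max(...) clamps below
            # guarantee RandomCrop((224, 224)) always fits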
def __init__(self):
self._init(locals())
def _augment(self, img, _):
h, w = img.shape[:2]
size = 224
scale = self.rng.randint(size, 308) * 1.0 / min(h, w)
scaleX = scale * self.rng.uniform(0.85, 1.15)
scaleY = scale * self.rng.uniform(0.85, 1.15)
desSize = map(int, (max(size, min(w, scaleX * w)),\
max(size, min(h, scaleY * h))))
dst = cv2.resize(img, tuple(desSize),
interpolation=cv2.INTER_CUBIC)
return dst
augmentors = [
Resize(),
imgaug.Rotation(max_deg=10),
imgaug.RandomApplyAug(imgaug.GaussianBlur(3), 0.5),
imgaug.Brightness(30, True),
imgaug.Gamma(),
imgaug.Contrast((0.8,1.2), True),
imgaug.RandomCrop((224, 224)),
imgaug.RandomApplyAug(imgaug.JpegNoise(), 0.8),
imgaug.RandomApplyAug(imgaug.GaussianDeform(
[(0.2, 0.2), (0.2, 0.8), (0.8,0.8), (0.8,0.2)],
(224, 224), 0.2, 3), 0.1),
imgaug.Flip(horiz=True),
imgaug.MapImage(lambda x: x - pp_mean_224),
]
else:
def resize_func(im):
h, w = im.shape[:2]
scale = 256.0 / min(h, w)
desSize = map(int, (max(224, min(w, scale * w)),\
max(224, min(h, scale * h))))
im = cv2.resize(im, tuple(desSize), interpolation=cv2.INTER_CUBIC)
return im
augmentors = [
imgaug.MapImage(resize_func),
imgaug.CenterCrop((224, 224)),
imgaug.MapImage(lambda x: x - pp_mean_224),
]
ds = AugmentImageComponent(ds, augmentors)
ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
if isTrain:
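        # run augmentation in up to 12 parallel processes over ZMQ
        # (this is what the pyzmq dependency in the README is for)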
ds = PrefetchDataZMQ(ds, min(12, multiprocessing.cpu_count()))
return ds
def get_config():
logger.auto_set_dir()
# prepare dataset
data_train = get_data('train')
data_test = get_data('val')
lr = tf.Variable(1e-4, trainable=False, name='learning_rate')
tf.scalar_summary('learning_rate', lr)
return TrainConfig(
dataset=data_train,
optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-5),
callbacks=Callbacks([
StatPrinter(),
ModelSaver(),
#HumanHyperParamSetter('learning_rate'),
ScheduledHyperParamSetter(
'learning_rate', [(56, 2e-5), (64, 4e-6)]),
InferenceRunner(data_test,
[ScalarStats('cost'),
ClassificationError('wrong-top1', 'val-top1-error'),
ClassificationError('wrong-top5', 'val-top5-error')])
]),
model=Model(),
step_per_epoch=10000,
max_epoch=100,
)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--gpu', help='the physical ids of GPUs to use')
parser.add_argument('--load', help='load a checkpoint')
parser.add_argument('--data', help='ILSVRC dataset dir')
parser.add_argument('--dorefa',
help='number of bits for W,A,G, separated by comma. Defaults to \'1,2,4\'',
default='1,2,4')
args = parser.parse_args()
BITW, BITA, BITG = map(int, args.dorefa.split(','))
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
config = get_config()
if args.load:
config.session_init = SaverRestore(args.load)
if args.gpu:
config.nr_tower = len(args.gpu.split(','))
SyncMultiGPUTrainer(config).train()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: dorefa.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>
import tensorflow as tf
# just a hack to avoid repeatedly registering the gradient
GRAD_DEFINED = False
def get_dorefa(bitW, bitA, bitG):
""" return the three quantization functions fw, fa, fg, for weights,
activations and gradients respectively"""
G = tf.get_default_graph()
def quantize(x, k):
n = float(2**k-1)
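        # rounding is not differentiable; map the gradient of Floor to Identity
        # (tf.round is implemented via a Floor op in this TF version), i.e. the
        # straight-through estimator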
with G.gradient_override_map({"Floor": "Identity"}):
return tf.round(x * n) / n
def fw(x):
if bitW == 32:
return x
if bitW == 1: # BWN
with G.gradient_override_map({"Sign": "Identity"}):
E = tf.stop_gradient(tf.reduce_mean(tf.abs(x)))
return tf.sign(x / E) * E
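        # k-bit weights: squash with tanh, rescale into [0, 1], quantize,
        # then map back to [-1, 1]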
x = tf.tanh(x)
x = x / tf.reduce_max(tf.abs(x)) * 0.5 + 0.5
return 2 * quantize(x, bitW) - 1
def fa(x):
if bitA == 32:
return x
return quantize(x, bitA)
global GRAD_DEFINED
if not GRAD_DEFINED:
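        # gradient quantization: normalize each sample's gradient into [0, 1]
        # by its max magnitude, add uniform noise of one quantization bin
        # (stochastic rounding), quantize to bitG bits, then rescale back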
@tf.RegisterGradient("FGGrad")
def grad_fg(op, x):
rank = x.get_shape().ndims
assert rank is not None
maxx = tf.reduce_max(tf.abs(x), list(range(1,rank)), keep_dims=True)
x = x / maxx
n = float(2**bitG-1)
x = x * 0.5 + 0.5 + tf.random_uniform(
tf.shape(x), minval=-0.5/n, maxval=0.5/n)
x = tf.clip_by_value(x, 0.0, 1.0)
x = quantize(x, bitG) - 0.5
return x * maxx * 2
GRAD_DEFINED = True
def fg(x):
if bitG == 32:
return x
with G.gradient_override_map({"Identity": "FGGrad"}):
return tf.identity(x)
return fw, fa, fg
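
if __name__ == '__main__':
    # Minimal sanity check, added here as a usage sketch (not part of the
    # paper's pipeline): quantize a constant tensor with (W,A,G) = (1,2,4).
    fw, fa, fg = get_dorefa(1, 2, 4)
    x = tf.constant([[0.1, -0.7], [0.4, 0.9]])
    with tf.Session() as sess:
        # 1-bit weights: sign(x) scaled by E = mean(|x|) = 0.525
        print(sess.run(fw(x)))
        # 2-bit activations: fa assumes inputs already lie in [0, 1]
        # (the training scripts clip with nonlin() first), so clip here too
        print(sess.run(fa(tf.clip_by_value(x, 0.0, 1.0))))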
@@ -11,79 +11,35 @@
import os
from tensorpack import *
from tensorpack.tfutils.symbolic_functions import *
from tensorpack.tfutils.summary import *
from dorefa import get_dorefa
"""
This is a tensorpack script for the SVHN results in paper:
DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients
http://arxiv.org/abs/1606.06160
The original experiments were performed on a proprietary framework.
This is our attempt to reproduce them on tensorpack/tensorflow.
You'll need tcmalloc to avoid large memory consumption: https://github.com/tensorflow/tensorflow/issues/2942
Accuracy:
This config, with (W,A,G)=(1,1,4), can reach 3.1~3.2% error after 150 epochs.
With the GaussianDeform augmentor, it will reach 2.8~2.9%
(we are not using this augmentor in the paper).
With (W,A,G)=(1,2,4), error is 3.0~3.1%.
With (W,A,G)=(32,32,32), error is about 2.9%.
Speed:
About 18 iterations/s on 1 Tesla M40. (4721 iterations / epoch)
To Run:
./svhn-digit-dorefa.py --dorefa 1,2,4
"""
BITW = 1
BITA = 2
BITG = 4
class Model(ModelDesc):
def _get_input_vars(self):
return [InputVar(tf.float32, [None, 40, 40, 3], 'input'),
......