Update ShuffleNet with different configs

ed444aab · Yuxin Wu · 7cb2606c · ed444aab · ed444aab · ed444aab
Commit ed444aab authored Jul 29, 2018 by Yuxin Wu
5 changed files
--- a/examples/DoReFa-Net/README.md
+++ b/examples/DoReFa-Net/README.md
@@ -14,13 +14,14 @@ This is a good set of baselines for research in model quantization.
 These quantization techniques, when applied to AlexNet, achieve the following ImageNet performance in this implementation:

 | Model                              | Bit Width <br/> (weights, activations, gradients) | Top 1 Validation Error <sup>[1](#ft1)</sup>                                     |
-|:----------------------------------:|:-------------------------------------------------:|:-----------------------------------------------------------------------------:|
+|:----------------------------------:|:-------------------------------------------------:|:-------------------------------------------------------------------------------:|
 | Full Precision<sup>[2](#ft2)</sup> | 32,32,32                                          | 40.3%                                                                           |
 | TTQ                                | t,32,32                                           | 42.0%                                                                           |
 | BWN                                | 1,32,32                                           | 44.6%                                                                           |
 | BNN                                | 1,1,32                                            | 51.9%                                                                           |
+| DoReFa                             | 8,8,8                                             | 42.0% [:arrow_down:](http://models.tensorpack.com/DoReFa-Net/AlexNet-8,8,8.npz) |
 | DoReFa                             | 1,2,32                                            | 46.6%                                                                           |
-| DoReFa                             | 1,2,6                                             | 46.8% [:arrow_down:](http://models.tensorpack.com/DoReFa-Net/alexnet-126.npz) |
+| DoReFa                             | 1,2,6                                             | 46.8% [:arrow_down:](http://models.tensorpack.com/DoReFa-Net/AlexNet-1,2,6.npz) |
 | DoReFa                             | 1,2,4                                             | 54.0%                                                                           |

 <a id="ft1">1</a>: These numbers were obtained by training on 8 GPUs with a total batch size of 256.

--- a/examples/ImageNetModels/README.md
+++ b/examples/ImageNetModels/README.md
@@ -10,8 +10,7 @@ Pretrained models can be downloaded at [tensorpack model zoo](http://models.tens
 Reproduce [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083)
 on ImageNet.

-This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in __the
-2nd arxiv version__ of the paper.
+This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in the paper.
 After 240 epochs (36 hours on 8 P100s) it reaches a top-1 error of 42.32%,
 matching the paper's number.


--- a/examples/ImageNetModels/shufflenet.py
+++ b/examples/ImageNetModels/shufflenet.py
@@ -4,6 +4,7 @@

 import argparse
 import numpy as np
+import math
 import os
 import cv2

@@ -15,6 +16,7 @@ from tensorpack.dataflow import imgaug
 from tensorpack.tfutils import argscope, get_model_loader, model_utils
 from tensorpack.tfutils.scope_utils import under_name_scope
 from tensorpack.utils.gpu import get_num_gpu
+from tensorpack.utils import logger

 from imagenet_utils import (
    get_imagenet_dataflow,
@@ -52,29 +54,24 @@ def channel_shuffle(l, group):
    return l


-def BN(x, name=None):
-    return BatchNorm('bn', x)
-
-
-class Model(ImageNetModel):
-    weight_decay = 4e-5
-
-    def get_logits(self, image):
-        def shufflenet_unit(l, out_channel, group, stride):
+@layer_register()
+def shufflenet_unit(l, out_channel, group, stride):
    in_shape = l.get_shape().as_list()
    in_channel = in_shape[1]
    shortcut = l

-            # We do not apply group convolution on the first pointwise layer
-            # because the number of input channels is relatively small.
-            first_split = group if in_channel != 12 else 1
+    # "We do not apply group convolution on the first pointwise layer
+    #  because the number of input channels is relatively small."
+    first_split = group if in_channel > 24 else 1
    l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, activation=BNReLU)
    l = channel_shuffle(l, group)
-            l = DepthConv('dconv', l, out_channel // 4, 3, activation=BN, stride=stride)
+    l = DepthConv('dconv', l, out_channel // 4, 3, stride=stride)
+    l = BatchNorm('dconv_bn', l)

    l = Conv2D('conv2', l,
               out_channel if stride == 1 else out_channel - in_channel,
-                       1, split=group, activation=BN)
+               1, split=group)
+    l = BatchNorm('conv2_bn', l)
    if stride == 1:     # unit (b)
        output = tf.nn.relu(shortcut + l)
    else:   # unit (c)
@@ -82,28 +79,44 @@ class Model(ImageNetModel):
        output = tf.concat([shortcut, tf.nn.relu(l)], axis=1)
    return output

+
+@layer_register(log_shape=True)
+def shufflenet_stage(input, channel, num_blocks, group):
+    l = input
+    for i in range(num_blocks):
+        name = 'block{}'.format(i)
+        l = shufflenet_unit(name, l, channel, group, 2 if i == 0 else 1)
+    return l
+
+
+class Model(ImageNetModel):
+    weight_decay = 4e-5
+
+    def get_logits(self, image):
+
        with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format=self.data_format), \
                argscope(Conv2D, use_bias=False):
-            group = 3
-            channels = [120, 240, 480]
-
-            l = Conv2D('conv1', image, 12, 3, strides=2, activation=BNReLU)
+            # See Table 1 & 2 in https://arxiv.org/abs/1707.01083
+            group = args.group
+            channels = {
+                3: [240, 480, 960],
+                4: [272, 544, 1088],
+                8: [384, 768, 1536]
+            }
+            mul = group * 4  # #chan has to be a multiple of this number
+            channels = [int(math.ceil(x * args.ratio / mul) * mul)
+                        for x in channels[group]]
+            # The first channel must be a multiple of group
+            first_chan = int(math.ceil(24 * args.ratio / group) * group)
+            logger.info("#Channels: " + str([first_chan] + channels))
+
+            l = Conv2D('conv1', image, first_chan, 3, strides=2, activation=BNReLU)
            l = MaxPooling('pool1', l, 3, 2, padding='SAME')

-            with tf.variable_scope('group1'):
-                for i in range(4):
-                    with tf.variable_scope('block{}'.format(i)):
-                        l = shufflenet_unit(l, channels[0], group, 2 if i == 0 else 1)
-
-            with tf.variable_scope('group2'):
-                for i in range(8):
-                    with tf.variable_scope('block{}'.format(i)):
-                        l = shufflenet_unit(l, channels[1], group, 2 if i == 0 else 1)
+            l = shufflenet_stage('group1', l, channels[0], 4, group)
+            l = shufflenet_stage('group2', l, channels[1], 8, group)
+            l = shufflenet_stage('group3', l, channels[2], 4, group)

-            with tf.variable_scope('group3'):
-                for i in range(4):
-                    with tf.variable_scope('block{}'.format(i)):
-                        l = shufflenet_unit(l, channels[2], group, 2 if i == 0 else 1)
            l = GlobalAvgPooling('gap', l)
            logits = FullyConnected('linear', l, 1000)
            return logits
@@ -179,6 +192,8 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--data', help='ILSVRC dataset dir')
+    parser.add_argument('--ratio', type=float, default=0.5, choices=[1., 0.5, 0.25])
+    parser.add_argument('--group', type=int, default=3, choices=[3, 4, 8])
    parser.add_argument('--load', help='load model')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--flops', action='store_true', help='print flops and exit')
@@ -210,7 +225,8 @@ if __name__ == '__main__':
            cmd='op',
            options=tf.profiler.ProfileOptionBuilder.float_operation())
    else:
-        logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))
+        logger.set_logger_dir(os.path.join(
+            'train_log', 'shufflenet-{}x-g={}'.format(args.ratio, args.group)))

        nr_tower = max(get_num_gpu(), 1)
        config = get_config(model, nr_tower)

--- a/tensorpack/models/batch_norm.py
+++ b/tensorpack/models/batch_norm.py
@@ -92,7 +92,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
          They are very similar in speed, but `internal_update=True` can be used
          when you have conditionals in your model, or when you have multiple networks to train.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
-        sync_statistics (str or None): one of None "nccl", or "horovod".
+        sync_statistics (str or None): one of None, "nccl", or "horovod".

          By default (None), it uses statistics of the input tensor to normalize.
          This is the standard way BatchNorm was done in most frameworks.

--- a/tensorpack/tfutils/gradproc.py
+++ b/tensorpack/tfutils/gradproc.py
@@ -251,7 +251,7 @@ class ScaleGradient(MapGradient):

            if re.match(regex, varname):
                if self._verbose:
-                    logger.info("Apply lr multiplier {} for {}".format(val, varname))
+                    logger.info("Gradient of '{}' is multipled by {}".format(varname, val))
                if val != 0:    # skip zero to speed up
                    return grad * val
                else: