[ShuffleNet] add shufflenet v2

3e0515c2 · Yuxin Wu · 32e41776 · 3e0515c2 · 3e0515c2
Commit 3e0515c2 authored Aug 02, 2018 by Yuxin Wu
Show whitespace changes
Inline Side-by-side

Showing with 89 additions and 35 deletions

examples/ImageNetModels/README.md examples/ImageNetModels/README.md +14 -10

examples/ImageNetModels/shufflenet.py examples/ImageNetModels/shufflenet.py +75 -25

No files found.
--- a/examples/ImageNetModels/README.md
+++ b/examples/ImageNetModels/README.md
@@ -2,28 +2,32 @@
 ImageNet training code of ResNet, ShuffleNet, DoReFa-Net, AlexNet, Inception, VGG with tensorpack.
 To train any of the models, just do `./{model}.py --data /path/to/ilsvrc`.
+More options are available in `./{model}.py -h`.
 Expected format of data directory is described in [docs](http://tensorpack.readthedocs.io/en/latest/modules/dataflow.dataset.html#tensorpack.dataflow.dataset.ILSVRC12).
 Some pretrained models can be downloaded at [tensorpack model zoo](http://models.tensorpack.com/).
 ### ShuffleNet
-Reproduce [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083)
+Reproduce ImageNet results of the following two papers:
-on ImageNet.
+ [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083)
+ [ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design](https://arxiv.org/abs/1807.11164)
-This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in the paper.
+| Model                                                                                                    | Flops | Top 1 Error | Flags         |
-After 240 epochs (36 hours on 8 P100s) it reaches top-1 error of 42.32%,
+|:---------------------------------------------------------------------------------------------------------|:------|:-----------:|:-------------:|
-matching the paper's number.
+| ShuffleNetV1 0.5x  [:arrow_down:](http://models.tensorpack.com/ImageNetModels/ShuffleNetV1-0.5x-g=8.npz) | 40M   | 40.8%       | `-r=0.5`      |
+| ShuffleNetV1 1x    [:arrow_down:](http://models.tensorpack.com/ImageNetModels/ShuffleNetV1-1x-g=8.npz)   | 140M  | 32.6%       | `-r=1`        |
+| ShuffleNetV2 0.5x  [:arrow_down:](http://models.tensorpack.com/ImageNetModels/ShuffleNetV2-0.5x.npz)     | 41M   | 39.5%       | `-r=0.5 --v2` |
+| ShuffleNetV2 1x    [:arrow_down:](http://models.tensorpack.com/ImageNetModels/ShuffleNetV2-1x.npz)       | 146M  | 30.6%       | `-r=1 --v2`   |
 To print flops:
 ```bash
-./shufflenet.py --flops
+./shufflenet.py --flops [--other-flags]
 ```
-It will print about 75Mflops, because the paper counts multiply+add as 1 flop.
-Download and evaluate the pretrained model:
+Download and evaluate a pretrained model:
 ```
-wget http://models.tensorpack.com/ImageNetModels/ShuffleNet.npz
+wget http://models.tensorpack.com/ImageNetModels/ShuffleNetV2-0.5x.npz
-./shufflenet.py --eval --data /path/to/ilsvrc --load ShuffleNet.npz
+./shufflenet.py --eval --data /path/to/ilsvrc --load ShuffleNetV2-0.5x.npz --v2 -r=0.5
 ```
 ### AlexNet

--- a/examples/ImageNetModels/shufflenet.py
+++ b/examples/ImageNetModels/shufflenet.py
@@ -30,7 +30,7 @@ def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
              W_init=None, activation=tf.identity):
    in_shape = x.get_shape().as_list()
    in_channel = in_shape[1]
-    assert out_channel % in_channel == 0
+    assert out_channel % in_channel == 0, (out_channel, in_channel)
    channel_mult = out_channel // in_channel
    if W_init is None:
@@ -48,7 +48,7 @@ def channel_shuffle(l, group):
    in_shape = l.get_shape().as_list()
    in_channel = in_shape[1]
    assert in_channel % group == 0, in_channel
-    l = tf.reshape(l, [-1, group, in_channel // group] + in_shape[-2:])
+    l = tf.reshape(l, [-1, in_channel // group, group] + in_shape[-2:])
    l = tf.transpose(l, [0, 2, 1, 3, 4])
    l = tf.reshape(l, [-1, in_channel] + in_shape[-2:])
    return l
@@ -80,11 +80,36 @@ def shufflenet_unit(l, out_channel, group, stride):
    return output
+@layer_register()
+def shufflenet_unit_v2(l, out_channel, stride):  # two-branch unit: branches are concatenated on axis 1, then channel-shuffled
+    if stride == 1:
+        shortcut, l = tf.split(l, 2, axis=1)  # split axis 1 in two: first part is the shortcut, second part is transformed
+    else:
+        shortcut, l = l, l  # any stride other than 1: both branches start from the full input
+    shortcut_channel = shortcut.shape[1]  # NOTE(review): may be a tf Dimension rather than an int -- confirm the arithmetic below accepts it
+    l = Conv2D('conv1', l, out_channel // 2, 1, activation=BNReLU)
+    l = DepthConv('dconv', l, out_channel // 2, 3, stride=stride)  # main branch applies the stride in the 3x3 depthwise conv
+    l = BatchNorm('dconv_bn', l)
+    l = Conv2D('conv2', l, out_channel - shortcut_channel, 1, activation=BNReLU)  # sized so shortcut + main add up to out_channel after the concat
+    if stride == 2:  # shortcut is only transformed for stride 2; NOTE(review): other strides != 1 leave it un-downsampled -- confirm callers pass only 1 or 2
+        shortcut = DepthConv('shortcut_dconv', shortcut, shortcut_channel, 3, stride=2)
+        shortcut = BatchNorm('shortcut_dconv_bn', shortcut)
+        shortcut = Conv2D('shortcut_conv', shortcut, shortcut_channel, 1, activation=BNReLU)
+    output = tf.concat([shortcut, l], axis=1)
+    output = channel_shuffle(output, 2)  # mix the two concatenated branches using 2 groups
+    return output
 @layer_register(log_shape=True)
 def shufflenet_stage(input, channel, num_blocks, group):  # chains num_blocks units named block0, block1, ...; group is passed to the V1 unit only
    l = input
    for i in range(num_blocks):
        name = 'block{}'.format(i)
+        if args.v2:  # reads the module-level parsed command-line args
+            l = shufflenet_unit_v2(name, l, channel, 2 if i == 0 else 1)  # first block of the stage uses stride 2, the rest stride 1
+        else:
            l = shufflenet_unit(name, l, channel, group, 2 if i == 0 else 1)
    return l
@@ -94,10 +119,12 @@ class Model(ImageNetModel):
    def get_logits(self, image):
-        with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format=self.data_format), \
+        with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format='channels_first'), \
                argscope(Conv2D, use_bias=False):
-            # See Table 1 & 2 in https://arxiv.org/abs/1707.01083
            group = args.group
+            if not args.v2:
+                # Copied from the paper
                channels = {
                    3: [240, 480, 960],
                    4: [272, 544, 1088],
@@ -108,14 +135,25 @@ class Model(ImageNetModel):
                            for x in channels[group]]
                # The first channel must be a multiple of group
                first_chan = int(math.ceil(24 * args.ratio / group) * group)
+            else:
+                # Copied from the paper
+                channels = {
+                    0.5: [48, 96, 192],
+                    1.: [116, 232, 464]
+                }[args.ratio]
+                first_chan = 24
            logger.info("#Channels: " + str([first_chan] + channels))
            l = Conv2D('conv1', image, first_chan, 3, strides=2, activation=BNReLU)
            l = MaxPooling('pool1', l, 3, 2, padding='SAME')
-            l = shufflenet_stage('group1', l, channels[0], 4, group)
+            l = shufflenet_stage('stage2', l, channels[0], 4, group)
-            l = shufflenet_stage('group2', l, channels[1], 8, group)
+            l = shufflenet_stage('stage3', l, channels[1], 8, group)
-            l = shufflenet_stage('group3', l, channels[2], 4, group)
+            l = shufflenet_stage('stage4', l, channels[2], 4, group)
+            if args.v2:
+                l = Conv2D('conv5', l, 1024, 1, activation=BNReLU)
            l = GlobalAvgPooling('gap', l)
            logits = FullyConnected('linear', l, 1000)
@@ -127,7 +165,8 @@ def get_data(name, batch):
    if isTrain:
        augmentors = [
-            GoogleNetResize(crop_area_fraction=0.49),
+            # use lighter augs if model is too small
+            GoogleNetResize(crop_area_fraction=0.49 if args.ratio < 1 else 0.08),
            imgaug.RandomOrderAug(
                [imgaug.BrightnessScale((0.6, 1.4), clip=False),
                 imgaug.Contrast((0.6, 1.4), clip=False),
@@ -192,9 +231,11 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--data', help='ILSVRC dataset dir')
-    parser.add_argument('--ratio', type=float, default=0.5, choices=[1., 0.5, 0.25])
+    parser.add_argument('-r', '--ratio', type=float, default=0.5, choices=[1., 0.5])
-    parser.add_argument('--group', type=int, default=3, choices=[3, 4, 8])
+    parser.add_argument('--group', type=int, default=8, choices=[3, 4, 8],
-    parser.add_argument('--load', help='load model')
+                        help="Number of groups for ShuffleNetV1")
+    parser.add_argument('--v2', action='store_true', help='Use ShuffleNetV2')
+    parser.add_argument('--load', help='path to load a model from')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--flops', action='store_true', help='print flops and exit')
    args = parser.parse_args()
@@ -202,6 +243,9 @@ if __name__ == '__main__':
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
+    if args.v2 and args.group != parser.get_default('group'):
+        logger.error("group= is not used in ShuffleNetV2!")
    model = Model()
    if args.eval:
@@ -216,7 +260,7 @@ if __name__ == '__main__':
        ]
        input = PlaceholderInput()
        input.setup(input_desc)
-        with TowerContext('', is_training=True):
+        with TowerContext('', is_training=False):
            model.build_graph(*input.get_input_tensors())
        model_utils.describe_trainable_vars()
@@ -224,9 +268,15 @@ if __name__ == '__main__':
            tf.get_default_graph(),
            cmd='op',
            options=tf.profiler.ProfileOptionBuilder.float_operation())
+        logger.info("Note that TensorFlow counts flops in a different way from the paper.")
+        logger.info("TensorFlow counts multiply+add as two flops, however the paper counts them "
+                    "as 1 flop because it can be executed in one instruction.")
+    else:
+        if args.v2:
+            name = "ShuffleNetV2-{}x".format(args.ratio)
        else:
-        logger.set_logger_dir(os.path.join(
+            name = "ShuffleNetV1-{}x-g{}".format(args.ratio, args.group)
-            'train_log', 'shufflenet-{}x-g={}'.format(args.ratio, args.group)))
+        logger.set_logger_dir(os.path.join('train_log', name))
        nr_tower = max(get_num_gpu(), 1)
        config = get_config(model, nr_tower)