Commit ed444aab authored by Yuxin Wu

Update ShuffleNet with different configs

parent 7cb2606c
...@@ -14,13 +14,14 @@ This is a good set of baselines for research in model quantization. ...@@ -14,13 +14,14 @@ This is a good set of baselines for research in model quantization.
These quantization techniques, when applied on AlexNet, achieve the following ImageNet performance in this implementation: These quantization techniques, when applied on AlexNet, achieve the following ImageNet performance in this implementation:
| Model | Bit Width <br/> (weights, activations, gradients) | Top 1 Validation Error <sup>[1](#ft1)</sup> | | Model | Bit Width <br/> (weights, activations, gradients) | Top 1 Validation Error <sup>[1](#ft1)</sup> |
|:----------------------------------:|:-------------------------------------------------:|:-----------------------------------------------------------------------------:| |:----------------------------------:|:-------------------------------------------------:|:-------------------------------------------------------------------------------:|
| Full Precision<sup>[2](#ft2)</sup> | 32,32,32 | 40.3% | | Full Precision<sup>[2](#ft2)</sup> | 32,32,32 | 40.3% |
| TTQ | t,32,32 | 42.0% | | TTQ | t,32,32 | 42.0% |
| BWN | 1,32,32 | 44.6% | | BWN | 1,32,32 | 44.6% |
| BNN | 1,1,32 | 51.9% | | BNN | 1,1,32 | 51.9% |
| DoReFa | 8,8,8 | 42.0% [:arrow_down:](http://models.tensorpack.com/DoReFa-Net/AlexNet-8,8,8.npz) |
| DoReFa | 1,2,32 | 46.6% | | DoReFa | 1,2,32 | 46.6% |
| DoReFa | 1,2,6 | 46.8% [:arrow_down:](http://models.tensorpack.com/DoReFa-Net/alexnet-126.npz) | | DoReFa | 1,2,6 | 46.8% [:arrow_down:](http://models.tensorpack.com/DoReFa-Net/AlexNet-1,2,6.npz) |
| DoReFa | 1,2,4 | 54.0% | | DoReFa | 1,2,4 | 54.0% |
<a id="ft1">1</a>: These numbers were obtained by training on 8 GPUs with a total batch size of 256. <a id="ft1">1</a>: These numbers were obtained by training on 8 GPUs with a total batch size of 256.
......
...@@ -10,8 +10,7 @@ Pretrained models can be downloaded at [tensorpack model zoo](http://models.tens ...@@ -10,8 +10,7 @@ Pretrained models can be downloaded at [tensorpack model zoo](http://models.tens
Reproduce [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083) Reproduce [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083)
on ImageNet. on ImageNet.
This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in __the This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in the paper.
2nd arxiv version__ of the paper.
After 240 epochs (36 hours on 8 P100s) it reaches top-1 error of 42.32%, After 240 epochs (36 hours on 8 P100s) it reaches top-1 error of 42.32%,
matching the paper's number. matching the paper's number.
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
import argparse import argparse
import numpy as np import numpy as np
import math
import os import os
import cv2 import cv2
...@@ -15,6 +16,7 @@ from tensorpack.dataflow import imgaug ...@@ -15,6 +16,7 @@ from tensorpack.dataflow import imgaug
from tensorpack.tfutils import argscope, get_model_loader, model_utils from tensorpack.tfutils import argscope, get_model_loader, model_utils
from tensorpack.tfutils.scope_utils import under_name_scope from tensorpack.tfutils.scope_utils import under_name_scope
from tensorpack.utils.gpu import get_num_gpu from tensorpack.utils.gpu import get_num_gpu
from tensorpack.utils import logger
from imagenet_utils import ( from imagenet_utils import (
get_imagenet_dataflow, get_imagenet_dataflow,
...@@ -52,29 +54,24 @@ def channel_shuffle(l, group): ...@@ -52,29 +54,24 @@ def channel_shuffle(l, group):
return l return l
def BN(x, name=None): @layer_register()
return BatchNorm('bn', x) def shufflenet_unit(l, out_channel, group, stride):
class Model(ImageNetModel):
weight_decay = 4e-5
def get_logits(self, image):
def shufflenet_unit(l, out_channel, group, stride):
in_shape = l.get_shape().as_list() in_shape = l.get_shape().as_list()
in_channel = in_shape[1] in_channel = in_shape[1]
shortcut = l shortcut = l
# We do not apply group convolution on the first pointwise layer # "We do not apply group convolution on the first pointwise layer
# because the number of input channels is relatively small. # because the number of input channels is relatively small."
first_split = group if in_channel != 12 else 1 first_split = group if in_channel > 24 else 1
l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, activation=BNReLU) l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, activation=BNReLU)
l = channel_shuffle(l, group) l = channel_shuffle(l, group)
l = DepthConv('dconv', l, out_channel // 4, 3, activation=BN, stride=stride) l = DepthConv('dconv', l, out_channel // 4, 3, stride=stride)
l = BatchNorm('dconv_bn', l)
l = Conv2D('conv2', l, l = Conv2D('conv2', l,
out_channel if stride == 1 else out_channel - in_channel, out_channel if stride == 1 else out_channel - in_channel,
1, split=group, activation=BN) 1, split=group)
l = BatchNorm('conv2_bn', l)
if stride == 1: # unit (b) if stride == 1: # unit (b)
output = tf.nn.relu(shortcut + l) output = tf.nn.relu(shortcut + l)
else: # unit (c) else: # unit (c)
...@@ -82,28 +79,44 @@ class Model(ImageNetModel): ...@@ -82,28 +79,44 @@ class Model(ImageNetModel):
output = tf.concat([shortcut, tf.nn.relu(l)], axis=1) output = tf.concat([shortcut, tf.nn.relu(l)], axis=1)
return output return output
@layer_register(log_shape=True)
def shufflenet_stage(input, channel, num_blocks, group):
l = input
for i in range(num_blocks):
name = 'block{}'.format(i)
l = shufflenet_unit(name, l, channel, group, 2 if i == 0 else 1)
return l
class Model(ImageNetModel):
weight_decay = 4e-5
def get_logits(self, image):
with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format=self.data_format), \ with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format=self.data_format), \
argscope(Conv2D, use_bias=False): argscope(Conv2D, use_bias=False):
group = 3 # See Table 1 & 2 in https://arxiv.org/abs/1707.01083
channels = [120, 240, 480] group = args.group
channels = {
l = Conv2D('conv1', image, 12, 3, strides=2, activation=BNReLU) 3: [240, 480, 960],
4: [272, 544, 1088],
8: [384, 768, 1536]
}
mul = group * 4 # #chan has to be a multiple of this number
channels = [int(math.ceil(x * args.ratio / mul) * mul)
for x in channels[group]]
# The first channel must be a multiple of group
first_chan = int(math.ceil(24 * args.ratio / group) * group)
logger.info("#Channels: " + str([first_chan] + channels))
l = Conv2D('conv1', image, first_chan, 3, strides=2, activation=BNReLU)
l = MaxPooling('pool1', l, 3, 2, padding='SAME') l = MaxPooling('pool1', l, 3, 2, padding='SAME')
with tf.variable_scope('group1'): l = shufflenet_stage('group1', l, channels[0], 4, group)
for i in range(4): l = shufflenet_stage('group2', l, channels[1], 8, group)
with tf.variable_scope('block{}'.format(i)): l = shufflenet_stage('group3', l, channels[2], 4, group)
l = shufflenet_unit(l, channels[0], group, 2 if i == 0 else 1)
with tf.variable_scope('group2'):
for i in range(8):
with tf.variable_scope('block{}'.format(i)):
l = shufflenet_unit(l, channels[1], group, 2 if i == 0 else 1)
with tf.variable_scope('group3'):
for i in range(4):
with tf.variable_scope('block{}'.format(i)):
l = shufflenet_unit(l, channels[2], group, 2 if i == 0 else 1)
l = GlobalAvgPooling('gap', l) l = GlobalAvgPooling('gap', l)
logits = FullyConnected('linear', l, 1000) logits = FullyConnected('linear', l, 1000)
return logits return logits
...@@ -179,6 +192,8 @@ if __name__ == '__main__': ...@@ -179,6 +192,8 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.') parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
parser.add_argument('--data', help='ILSVRC dataset dir') parser.add_argument('--data', help='ILSVRC dataset dir')
parser.add_argument('--ratio', type=float, default=0.5, choices=[1., 0.5, 0.25])
parser.add_argument('--group', type=int, default=3, choices=[3, 4, 8])
parser.add_argument('--load', help='load model') parser.add_argument('--load', help='load model')
parser.add_argument('--eval', action='store_true') parser.add_argument('--eval', action='store_true')
parser.add_argument('--flops', action='store_true', help='print flops and exit') parser.add_argument('--flops', action='store_true', help='print flops and exit')
...@@ -210,7 +225,8 @@ if __name__ == '__main__': ...@@ -210,7 +225,8 @@ if __name__ == '__main__':
cmd='op', cmd='op',
options=tf.profiler.ProfileOptionBuilder.float_operation()) options=tf.profiler.ProfileOptionBuilder.float_operation())
else: else:
logger.set_logger_dir(os.path.join('train_log', 'shufflenet')) logger.set_logger_dir(os.path.join(
'train_log', 'shufflenet-{}x-g={}'.format(args.ratio, args.group)))
nr_tower = max(get_num_gpu(), 1) nr_tower = max(get_num_gpu(), 1)
config = get_config(model, nr_tower) config = get_config(model, nr_tower)
......
...@@ -92,7 +92,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, ...@@ -92,7 +92,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
They are very similar in speed, but `internal_update=True` can be used They are very similar in speed, but `internal_update=True` can be used
when you have conditionals in your model, or when you have multiple networks to train. when you have conditionals in your model, or when you have multiple networks to train.
Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699 Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
sync_statistics (str or None): one of None "nccl", or "horovod". sync_statistics (str or None): one of None, "nccl", or "horovod".
By default (None), it uses statistics of the input tensor to normalize. By default (None), it uses statistics of the input tensor to normalize.
This is the standard way BatchNorm was done in most frameworks. This is the standard way BatchNorm was done in most frameworks.
......
...@@ -251,7 +251,7 @@ class ScaleGradient(MapGradient): ...@@ -251,7 +251,7 @@ class ScaleGradient(MapGradient):
if re.match(regex, varname): if re.match(regex, varname):
if self._verbose: if self._verbose:
logger.info("Apply lr multiplier {} for {}".format(val, varname)) logger.info("Gradient of '{}' is multipled by {}".format(varname, val))
if val != 0: # skip zero to speed up if val != 0: # skip zero to speed up
return grad * val return grad * val
else: else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment