Commit 3e0515c2 authored by Yuxin Wu

[ShuffleNet] add shufflenet v2

parent 32e41776
...@@ -2,28 +2,32 @@ ...@@ -2,28 +2,32 @@
ImageNet training code of ResNet, ShuffleNet, DoReFa-Net, AlexNet, Inception, VGG with tensorpack. ImageNet training code of ResNet, ShuffleNet, DoReFa-Net, AlexNet, Inception, VGG with tensorpack.
To train any of the models, just do `./{model}.py --data /path/to/ilsvrc`. To train any of the models, just do `./{model}.py --data /path/to/ilsvrc`.
More options are available in `./{model}.py -h`.
Expected format of data directory is described in [docs](http://tensorpack.readthedocs.io/en/latest/modules/dataflow.dataset.html#tensorpack.dataflow.dataset.ILSVRC12). Expected format of data directory is described in [docs](http://tensorpack.readthedocs.io/en/latest/modules/dataflow.dataset.html#tensorpack.dataflow.dataset.ILSVRC12).
Some pretrained models can be downloaded at [tensorpack model zoo](http://models.tensorpack.com/). Some pretrained models can be downloaded at [tensorpack model zoo](http://models.tensorpack.com/).
### ShuffleNet ### ShuffleNet
Reproduce [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083) Reproduce ImageNet results of the following two papers:
on ImageNet. + [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083)
+ [ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design](https://arxiv.org/abs/1807.11164)
This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in the paper. | Model | Flops | Top 1 Error | Flags |
After 240 epochs (36 hours on 8 P100s) it reaches top-1 error of 42.32%, |:---------------------------------------------------------------------------------------------------------|:------|:-----------:|:-------------:|
matching the paper's number. | ShuffleNetV1 0.5x [:arrow_down:](http://models.tensorpack.com/ImageNetModels/ShuffleNetV1-0.5x-g=8.npz) | 40M | 40.8% | `-r=0.5` |
| ShuffleNetV1 1x [:arrow_down:](http://models.tensorpack.com/ImageNetModels/ShuffleNetV1-1x-g=8.npz) | 140M | 32.6% | `-r=1` |
| ShuffleNetV2 0.5x [:arrow_down:](http://models.tensorpack.com/ImageNetModels/ShuffleNetV2-0.5x.npz) | 41M | 39.5% | `-r=0.5 --v2` |
| ShuffleNetV2 1x [:arrow_down:](http://models.tensorpack.com/ImageNetModels/ShuffleNetV2-1x.npz) | 146M | 30.6% | `-r=1 --v2` |
To print flops: To print flops:
```bash ```bash
./shufflenet.py --flops ./shufflenet.py --flops [--other-flags]
``` ```
It will print about 75Mflops, because the paper counts multiply+add as 1 flop.
Download and evaluate the pretrained model: Download and evaluate a pretrained model:
``` ```
wget http://models.tensorpack.com/ImageNetModels/ShuffleNet.npz wget http://models.tensorpack.com/ImageNetModels/ShuffleNetV2-0.5x.npz
./shufflenet.py --eval --data /path/to/ilsvrc --load ShuffleNet.npz ./shufflenet.py --eval --data /path/to/ilsvrc --load ShuffleNetV2-0.5x.npz --v2 -r=0.5
``` ```
### AlexNet ### AlexNet
......
...@@ -30,7 +30,7 @@ def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1, ...@@ -30,7 +30,7 @@ def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
W_init=None, activation=tf.identity): W_init=None, activation=tf.identity):
in_shape = x.get_shape().as_list() in_shape = x.get_shape().as_list()
in_channel = in_shape[1] in_channel = in_shape[1]
assert out_channel % in_channel == 0 assert out_channel % in_channel == 0, (out_channel, in_channel)
channel_mult = out_channel // in_channel channel_mult = out_channel // in_channel
if W_init is None: if W_init is None:
...@@ -48,7 +48,7 @@ def channel_shuffle(l, group): ...@@ -48,7 +48,7 @@ def channel_shuffle(l, group):
in_shape = l.get_shape().as_list() in_shape = l.get_shape().as_list()
in_channel = in_shape[1] in_channel = in_shape[1]
assert in_channel % group == 0, in_channel assert in_channel % group == 0, in_channel
l = tf.reshape(l, [-1, group, in_channel // group] + in_shape[-2:]) l = tf.reshape(l, [-1, in_channel // group, group] + in_shape[-2:])
l = tf.transpose(l, [0, 2, 1, 3, 4]) l = tf.transpose(l, [0, 2, 1, 3, 4])
l = tf.reshape(l, [-1, in_channel] + in_shape[-2:]) l = tf.reshape(l, [-1, in_channel] + in_shape[-2:])
return l return l
...@@ -80,11 +80,36 @@ def shufflenet_unit(l, out_channel, group, stride): ...@@ -80,11 +80,36 @@ def shufflenet_unit(l, out_channel, group, stride):
return output return output
@layer_register()
def shufflenet_unit_v2(l, out_channel, stride):
    """Build one ShuffleNetV2 unit: a shortcut branch and a conv branch,
    concatenated along axis 1 and then channel-shuffled with 2 groups.

    NOTE(review): callers in this file pass a name string as the first
    positional argument (e.g. ``shufflenet_unit_v2(name, l, channel, stride)``);
    presumably ``layer_register`` injects that parameter -- confirm against
    tensorpack's documentation.

    Args:
        l: input tensor. Channels are taken from axis 1, which is the axis
            used for the split, the static channel lookup and the concat.
        out_channel: total number of channels of the concatenated output.
        stride: the code distinguishes ``1`` (input is split in half, the
            shortcut half passes through untouched) and ``2`` (both branches
            receive the full input and the shortcut is downsampled too).

    Returns:
        The channel-shuffled concatenation of the shortcut and conv branches.
    """
    if stride == 1:
        # Half of the channels bypass the convolutions entirely.
        shortcut, l = tf.split(l, 2, axis=1)
    else:
        # Downsampling unit: both branches start from the full input.
        shortcut, l = l, l
    # Static shape elements are `Dimension` objects under TF 1.x; convert to a
    # plain int so the channel arithmetic and layer arguments below are ints.
    shortcut_channel = int(shortcut.shape[1])

    l = Conv2D('conv1', l, out_channel // 2, 1, activation=BNReLU)
    l = DepthConv('dconv', l, out_channel // 2, 3, stride=stride)
    l = BatchNorm('dconv_bn', l)
    # The conv branch supplies whatever the shortcut does not, so the
    # concatenation below has exactly `out_channel` channels.
    l = Conv2D('conv2', l, out_channel - shortcut_channel, 1, activation=BNReLU)

    if stride == 2:
        shortcut = DepthConv('shortcut_dconv', shortcut, shortcut_channel, 3, stride=2)
        shortcut = BatchNorm('shortcut_dconv_bn', shortcut)
        shortcut = Conv2D('shortcut_conv', shortcut, shortcut_channel, 1, activation=BNReLU)

    output = tf.concat([shortcut, l], axis=1)
    output = channel_shuffle(output, 2)
    return output
@layer_register(log_shape=True) @layer_register(log_shape=True)
def shufflenet_stage(input, channel, num_blocks, group): def shufflenet_stage(input, channel, num_blocks, group):
l = input l = input
for i in range(num_blocks): for i in range(num_blocks):
name = 'block{}'.format(i) name = 'block{}'.format(i)
if args.v2:
l = shufflenet_unit_v2(name, l, channel, 2 if i == 0 else 1)
else:
l = shufflenet_unit(name, l, channel, group, 2 if i == 0 else 1) l = shufflenet_unit(name, l, channel, group, 2 if i == 0 else 1)
return l return l
...@@ -94,10 +119,12 @@ class Model(ImageNetModel): ...@@ -94,10 +119,12 @@ class Model(ImageNetModel):
def get_logits(self, image): def get_logits(self, image):
with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format=self.data_format), \ with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format='channels_first'), \
argscope(Conv2D, use_bias=False): argscope(Conv2D, use_bias=False):
# See Table 1 & 2 in https://arxiv.org/abs/1707.01083
group = args.group group = args.group
if not args.v2:
# Copied from the paper
channels = { channels = {
3: [240, 480, 960], 3: [240, 480, 960],
4: [272, 544, 1088], 4: [272, 544, 1088],
...@@ -108,14 +135,25 @@ class Model(ImageNetModel): ...@@ -108,14 +135,25 @@ class Model(ImageNetModel):
for x in channels[group]] for x in channels[group]]
# The first channel must be a multiple of group # The first channel must be a multiple of group
first_chan = int(math.ceil(24 * args.ratio / group) * group) first_chan = int(math.ceil(24 * args.ratio / group) * group)
else:
# Copied from the paper
channels = {
0.5: [48, 96, 192],
1.: [116, 232, 464]
}[args.ratio]
first_chan = 24
logger.info("#Channels: " + str([first_chan] + channels)) logger.info("#Channels: " + str([first_chan] + channels))
l = Conv2D('conv1', image, first_chan, 3, strides=2, activation=BNReLU) l = Conv2D('conv1', image, first_chan, 3, strides=2, activation=BNReLU)
l = MaxPooling('pool1', l, 3, 2, padding='SAME') l = MaxPooling('pool1', l, 3, 2, padding='SAME')
l = shufflenet_stage('group1', l, channels[0], 4, group) l = shufflenet_stage('stage2', l, channels[0], 4, group)
l = shufflenet_stage('group2', l, channels[1], 8, group) l = shufflenet_stage('stage3', l, channels[1], 8, group)
l = shufflenet_stage('group3', l, channels[2], 4, group) l = shufflenet_stage('stage4', l, channels[2], 4, group)
if args.v2:
l = Conv2D('conv5', l, 1024, 1, activation=BNReLU)
l = GlobalAvgPooling('gap', l) l = GlobalAvgPooling('gap', l)
logits = FullyConnected('linear', l, 1000) logits = FullyConnected('linear', l, 1000)
...@@ -127,7 +165,8 @@ def get_data(name, batch): ...@@ -127,7 +165,8 @@ def get_data(name, batch):
if isTrain: if isTrain:
augmentors = [ augmentors = [
GoogleNetResize(crop_area_fraction=0.49), # use lighter augs if model is too small
GoogleNetResize(crop_area_fraction=0.49 if args.ratio < 1 else 0.08),
imgaug.RandomOrderAug( imgaug.RandomOrderAug(
[imgaug.BrightnessScale((0.6, 1.4), clip=False), [imgaug.BrightnessScale((0.6, 1.4), clip=False),
imgaug.Contrast((0.6, 1.4), clip=False), imgaug.Contrast((0.6, 1.4), clip=False),
...@@ -192,9 +231,11 @@ if __name__ == '__main__': ...@@ -192,9 +231,11 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.') parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
parser.add_argument('--data', help='ILSVRC dataset dir') parser.add_argument('--data', help='ILSVRC dataset dir')
parser.add_argument('--ratio', type=float, default=0.5, choices=[1., 0.5, 0.25]) parser.add_argument('-r', '--ratio', type=float, default=0.5, choices=[1., 0.5])
parser.add_argument('--group', type=int, default=3, choices=[3, 4, 8]) parser.add_argument('--group', type=int, default=8, choices=[3, 4, 8],
parser.add_argument('--load', help='load model') help="Number of groups for ShuffleNetV1")
parser.add_argument('--v2', action='store_true', help='Use ShuffleNetV2')
parser.add_argument('--load', help='path to load a model from')
parser.add_argument('--eval', action='store_true') parser.add_argument('--eval', action='store_true')
parser.add_argument('--flops', action='store_true', help='print flops and exit') parser.add_argument('--flops', action='store_true', help='print flops and exit')
args = parser.parse_args() args = parser.parse_args()
...@@ -202,6 +243,9 @@ if __name__ == '__main__': ...@@ -202,6 +243,9 @@ if __name__ == '__main__':
if args.gpu: if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
if args.v2 and args.group != parser.get_default('group'):
logger.error("group= is not used in ShuffleNetV2!")
model = Model() model = Model()
if args.eval: if args.eval:
...@@ -216,7 +260,7 @@ if __name__ == '__main__': ...@@ -216,7 +260,7 @@ if __name__ == '__main__':
] ]
input = PlaceholderInput() input = PlaceholderInput()
input.setup(input_desc) input.setup(input_desc)
with TowerContext('', is_training=True): with TowerContext('', is_training=False):
model.build_graph(*input.get_input_tensors()) model.build_graph(*input.get_input_tensors())
model_utils.describe_trainable_vars() model_utils.describe_trainable_vars()
...@@ -224,9 +268,15 @@ if __name__ == '__main__': ...@@ -224,9 +268,15 @@ if __name__ == '__main__':
tf.get_default_graph(), tf.get_default_graph(),
cmd='op', cmd='op',
options=tf.profiler.ProfileOptionBuilder.float_operation()) options=tf.profiler.ProfileOptionBuilder.float_operation())
logger.info("Note that TensorFlow counts flops in a different way from the paper.")
logger.info("TensorFlow counts multiply+add as two flops, however the paper counts them "
"as 1 flop because it can be executed in one instruction.")
else:
if args.v2:
name = "ShuffleNetV2-{}x".format(args.ratio)
else: else:
logger.set_logger_dir(os.path.join( name = "ShuffleNetV1-{}x-g{}".format(args.ratio, args.group)
'train_log', 'shufflenet-{}x-g={}'.format(args.ratio, args.group))) logger.set_logger_dir(os.path.join('train_log', name))
nr_tower = max(get_num_gpu(), 1) nr_tower = max(get_num_gpu(), 1)
config = get_config(model, nr_tower) config = get_config(model, nr_tower)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment