Commit 3e9f164d authored by Yuxin Wu

Upgrade shufflenet; fix paramsetter for restore

parent 9185744d
@@ -54,7 +54,7 @@ class Model(DQNModel):
     def _get_DQN_prediction(self, image):
         """ image: [0,255]"""
         image = image / 255.0
-        with argscope(Conv2D, nl=PReLU.symbolic_function, use_bias=True):
+        with argscope(Conv2D, activation=lambda x: PReLU('prelu', x), use_bias=True):
             l = (LinearWrap(image)
                  # Nature architecture
                  .Conv2D('conv0', out_channel=32, kernel_shape=8, stride=4)
...
@@ -15,6 +15,8 @@ assert tensorpack.tfutils.common.get_tf_version_number() >= 1.2

 class Model(ModelDesc):
+    learning_rate = 1e-3
+
     def __init__(self, image_shape, channel, method, num_actions, gamma):
         self.image_shape = image_shape
         self.channel = channel
@@ -80,7 +82,7 @@ class Model(ModelDesc):
         summary.add_moving_summary(self.cost)

     def _get_optimizer(self):
-        lr = tf.get_variable('learning_rate', initializer=1e-3, trainable=False)
+        lr = tf.get_variable('learning_rate', initializer=self.learning_rate, trainable=False)
         opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
         return optimizer.apply_grad_processors(
             opt, [gradproc.GlobalNormClip(10), gradproc.SummaryGradient()])
...
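The new `learning_rate` class attribute replaces the hard-coded `1e-3` in `_get_optimizer`, so the initial learning rate can be changed by overriding one attribute rather than the whole optimizer. A minimal sketch of the intended usage (the subclass name and value here are hypothetical, not from this commit):

```python
# Hypothetical subclass: override the class attribute instead of
# redefining _get_optimizer just to change the initial learning rate.
class MyDQNModel(Model):
    learning_rate = 4e-4  # illustrative value only
```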
@@ -546,3 +546,35 @@ def maskrcnn_loss(mask_logits, fg_labels, fg_target_masks):
     add_moving_summary(loss, accuracy, fg_pixel_ratio, pos_accuracy)
     return loss
+
+
+if __name__ == '__main__':
+    """
+    Demonstrate what's wrong with tf.image.crop_and_resize:
+    """
+    import numpy as np
+    import tensorflow.contrib.eager as tfe
+    tfe.enable_eager_execution()
+
+    # We want to crop a 2x2 box out of a 5x5 image, and resize it to 4x4.
+    image = np.arange(25).astype('float32').reshape(5, 5)
+    boxes = np.asarray([[1, 1, 3, 3]], dtype='float32')
+    target = 4
+
+    print(crop_and_resize(
+        image[None, None, :, :], boxes, [0], target)[0][0])
+    """
+    Expected values:
+    4.5 5 5.5 6
+    7 7.5 8 8.5
+    9.5 10 10.5 11
+    12 12.5 13 13.5
+
+    Our implementation is not perfect either: when boxes lie on the image
+    border, TF pads with zeros instead of border values. But this rarely
+    happens, so it's fine.
+
+    You cannot easily get the above results with tf.image.crop_and_resize.
+    Try it yourself here:
+    """
+    print(tf.image.crop_and_resize(
+        image[None, :, :, None],
+        np.asarray([[1, 1, 2, 2]]) / 4.0, [0], [target, target])[0][:, :, 0])
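For reference, the mismatch demonstrated above comes from the sampling grid `tf.image.crop_and_resize` builds: normalized box coordinates are scaled by `(size - 1)` and the crop is sampled align-corners style. A minimal numpy sketch of that grid, assuming the documented TF semantics (this sketch is not part of the commit):

```python
import numpy as np

# Sampling grid of tf.image.crop_and_resize for the demo box, assuming
# its documented (size - 1) scaling and align-corners crop sampling.
H = W = 5
y1, x1, y2, x2 = np.asarray([1., 1., 2., 2.]) / 4.0  # normalized box from the demo
crop = 4
ys = y1 * (H - 1) + np.arange(crop) * (y2 - y1) * (H - 1) / (crop - 1)
xs = x1 * (W - 1) + np.arange(crop) * (x2 - x1) * (W - 1) / (crop - 1)
# The demo image is image[y, x] = 5 * y + x, on which bilinear sampling is exact:
print(ys[:, None] * 5 + xs[None, :])  # top-left sample is 6.0, not the expected 4.5
```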
@@ -4,8 +4,8 @@
 Reproduce [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083)
 on ImageNet.

-This is a 40Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x (arch2) g=8` in the paper.
-After 100 epochs it reaches top-1 error of 42.62, matching the paper's number.
+This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in [version 2](https://arxiv.org/pdf/1707.01083v2) of the paper.
+After 240 epochs it reaches top-1 error of 42.32, better than the paper's number.

 ### Usage:
@@ -13,14 +13,14 @@ Print flops with tensorflow:
 ```bash
 ./shufflenet.py --flops
 ```
-It will print about 80Mflops, because the paper counts multiply+add as 1 flop.
+It will print about 75Mflops, because the paper counts multiply+add as 1 flop.

-Train (takes 24 hours on 8 Maxwell TitanX):
+Train (takes 36 hours on 8 P100s):
 ```bash
 ./shufflenet.py --data /path/to/ilsvrc/
 ```

-Eval the [pretrained model](http://models.tensorpack.com/ShuffleNet/):
+Evaluate the [pretrained model](http://models.tensorpack.com/ShuffleNet/):
 ```
 ./shufflenet.py --eval --data /path/to/ilsvrc --load /path/to/model
 ```
@@ -23,7 +23,7 @@ from imagenet_utils import (
     get_imagenet_dataflow,
     ImageNetModel, GoogleNetResize, eval_on_ILSVRC12)

-TOTAL_BATCH_SIZE = 256
+TOTAL_BATCH_SIZE = 1024


 @layer_register(log_shape=True)
@@ -48,6 +48,7 @@ def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
 def channel_shuffle(l, group):
     in_shape = l.get_shape().as_list()
     in_channel = in_shape[1]
+    assert in_channel % group == 0, in_channel
     l = tf.reshape(l, [-1, group, in_channel // group] + in_shape[-2:])
     l = tf.transpose(l, [0, 2, 1, 3, 4])
     l = tf.reshape(l, [-1, in_channel] + in_shape[-2:])
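The added assertion guards the reshape above: the shuffle is only well-defined when the channel count divides evenly by `group`. A small numpy illustration of the same reshape-transpose-reshape trick, applied to a bare channel index vector (illustration only, not code from this commit):

```python
import numpy as np

# Channel shuffle of 6 channels with group=3: after the shuffle, every
# contiguous slice of the output contains one channel from each group.
channels = np.arange(6)  # groups are [0 1], [2 3], [4 5]
group = 3
shuffled = channels.reshape(group, -1).T.reshape(-1)
print(shuffled)  # [0 2 4 1 3 5]
```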
@@ -69,7 +70,7 @@ class Model(ImageNetModel):
         # We do not apply group convolution on the first pointwise layer
         # because the number of input channels is relatively small.
-        first_split = group if in_channel != 16 else 1
+        first_split = group if in_channel != 12 else 1
         l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, nl=BNReLU)
         l = channel_shuffle(l, group)
         l = DepthConv('dconv', l, out_channel // 4, 3, nl=BN, stride=stride)
@@ -86,10 +87,10 @@ class Model(ImageNetModel):
         with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format=self.data_format), \
                 argscope(Conv2D, use_bias=False):
-            group = 8
-            channels = [224, 416, 832]
-            l = Conv2D('conv1', image, 16, 3, stride=2, nl=BNReLU)
+            group = 3
+            channels = [120, 240, 480]
+            l = Conv2D('conv1', image, 12, 3, stride=2, nl=BNReLU)
             l = MaxPooling('pool1', l, 3, 2, padding='SAME')

             with tf.variable_scope('group1'):
@@ -98,7 +99,7 @@ class Model(ImageNetModel):
                     l = shufflenet_unit(l, channels[0], group, 2 if i == 0 else 1)

             with tf.variable_scope('group2'):
-                for i in range(6):
+                for i in range(8):
                     with tf.variable_scope('block{}'.format(i)):
                         l = shufflenet_unit(l, channels[1], group, 2 if i == 0 else 1)
@@ -148,11 +149,15 @@ def get_config(model, nr_tower):
     logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

     dataset_train = get_data('train', batch)
     dataset_val = get_data('val', batch)
+
+    step_size = 1280000 // TOTAL_BATCH_SIZE
+    max_iter = 3 * 10**5
+    max_epoch = (max_iter // step_size) + 1
     callbacks = [
         ModelSaver(),
-        ScheduledHyperParamSetter('learning_rate',
-                                  [(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]),
-        HumanHyperParamSetter('learning_rate'),
+        ScheduledHyperParamSetter('learning_rate',
+                                  [(0, 0.5), (max_iter, 0)],
+                                  interp='linear', step_based=True),
     ]
     infs = [ClassificationError('wrong-top1', 'val-error-top1'),
             ClassificationError('wrong-top5', 'val-error-top5')]
@@ -168,8 +173,8 @@ def get_config(model, nr_tower):
         model=model,
         dataflow=dataset_train,
         callbacks=callbacks,
-        steps_per_epoch=5000,
-        max_epoch=100,
+        steps_per_epoch=step_size,
+        max_epoch=max_epoch,
     )
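The new schedule `[(0, 0.5), (max_iter, 0)]` with `interp='linear', step_based=True` decays the learning rate linearly from 0.5 to 0 over 3e5 steps, replacing the old stepwise drops at fixed epochs. A quick sketch of the value the setter computes at a given global step, using the same interpolation formula as the callback change further down:

```python
# Linearly interpolated learning rate for the schedule [(0, 0.5), (max_iter, 0)]:
# v = (step - laste) * 1. / (e - laste) * (v - lastv) + lastv
def lr_at(step, max_iter=3 * 10**5):
    return (step - 0) * 1. / (max_iter - 0) * (0. - 0.5) + 0.5

print(lr_at(0))       # 0.5
print(lr_at(150000))  # 0.25
print(lr_at(300000))  # 0.0
```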
@@ -207,8 +212,7 @@ if __name__ == '__main__':
             cmd='op',
             options=tf.profiler.ProfileOptionBuilder.float_operation())
     else:
-        logger.set_logger_dir(
-            os.path.join('train_log', 'shufflenet'))
+        logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))

         nr_tower = max(get_nr_gpu(), 1)
         config = get_config(model, nr_tower)
...
@@ -246,12 +246,6 @@ class ScheduledHyperParamSetter(HyperParamSetter):

     def _get_value_to_set(self):
         refnum = self.global_step if self._step else self.epoch_num
-        if self.interp is None:
-            for e, v in self.schedule:
-                if e == refnum:
-                    return v
-            return None
-        else:
         laste, lastv = None, None
         for e, v in self.schedule:
             if e == refnum:
@@ -262,7 +256,10 @@ class ScheduledHyperParamSetter(HyperParamSetter):
         if laste is None or laste == e:
             # hasn't reached the first scheduled point, or reached the end of all scheduled points
             return None
-        v = (refnum - laste) * 1. / (e - laste) * (v - lastv) + lastv
+        if self.interp is not None:
+            v = (refnum - laste) * 1. / (e - laste) * (v - lastv) + lastv
+        else:
+            v = lastv
         return v

     def _trigger_epoch(self):
...