Commit 3e9f164d authored by Yuxin Wu

Upgrade shufflenet; fix paramsetter for restore

parent 9185744d
@@ -54,7 +54,7 @@ class Model(DQNModel):
def _get_DQN_prediction(self, image):
""" image: [0,255]"""
image = image / 255.0
with argscope(Conv2D, nl=PReLU.symbolic_function, use_bias=True):
with argscope(Conv2D, activation=lambda x: PReLU('prelu', x), use_bias=True):
l = (LinearWrap(image)
# Nature architecture
.Conv2D('conv0', out_channel=32, kernel_shape=8, stride=4)
@@ -15,6 +15,8 @@ assert tensorpack.tfutils.common.get_tf_version_number() >= 1.2
class Model(ModelDesc):
learning_rate = 1e-3
def __init__(self, image_shape, channel, method, num_actions, gamma):
self.image_shape = image_shape
self.channel = channel
@@ -80,7 +82,7 @@ class Model(ModelDesc):
summary.add_moving_summary(self.cost)
def _get_optimizer(self):
lr = tf.get_variable('learning_rate', initializer=1e-3, trainable=False)
lr = tf.get_variable('learning_rate', initializer=self.learning_rate, trainable=False)
opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
return optimizer.apply_grad_processors(
opt, [gradproc.GlobalNormClip(10), gradproc.SummaryGradient()])
@@ -546,3 +546,35 @@ def maskrcnn_loss(mask_logits, fg_labels, fg_target_masks):
add_moving_summary(loss, accuracy, fg_pixel_ratio, pos_accuracy)
return loss
if __name__ == '__main__':
"""
Demonstrate what's wrong with tf.image.crop_and_resize:
"""
import numpy as np
import tensorflow.contrib.eager as tfe
tfe.enable_eager_execution()
# want to crop 2x2 out of a 5x5 image, and resize to 4x4
image = np.arange(25).astype('float32').reshape(5, 5)
boxes = np.asarray([[1, 1, 3, 3]], dtype='float32')
target = 4
print(crop_and_resize(
image[None, None, :, :], boxes, [0], target)[0][0])
"""
Expected values:
4.5 5 5.5 6
7 7.5 8 8.5
9.5 10 10.5 11
12 12.5 13 13.5
Our implementation is not perfect either: when boxes lie on the image border,
TF pads with zeros instead of border values. But this rarely happens, so it's fine.
You cannot easily get the above results with tf.image.crop_and_resize.
Try it out yourself here:
"""
print(tf.image.crop_and_resize(
image[None, :, :, None],
np.asarray([[1, 1, 2, 2]]) / 4.0, [0], [target, target])[0][:, :, 0])
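For reference, here is a small NumPy sketch (an editor's illustration, not code from this repository) that reproduces the expected values above: each output cell is sampled at its center inside the box, with bilinear interpolation between pixel centers (pixel k covers [k, k+1) and has its center at k + 0.5).

```python
import numpy as np

def crop_resize_reference(img, box, size):
    """img: (H, W) array; box: (y1, x1, y2, x2) in pixel coordinates; size: output side."""
    y1, x1, y2, x2 = box
    out = np.empty((size, size), dtype=np.float32)
    for i in range(size):
        for j in range(size):
            # center of output cell (i, j), mapped into image coordinates
            y = y1 + (i + 0.5) * (y2 - y1) / size
            x = x1 + (j + 0.5) * (x2 - x1) / size
            # bilinear interpolation between pixel centers (pixel k's center is k + 0.5);
            # assumes the box stays strictly inside the image, so no border handling
            fy, fx = y - 0.5, x - 0.5
            y0, x0 = int(np.floor(fy)), int(np.floor(fx))
            wy, wx = fy - y0, fx - x0
            out[i, j] = ((1 - wy) * (1 - wx) * img[y0, x0] +
                         (1 - wy) * wx * img[y0, x0 + 1] +
                         wy * (1 - wx) * img[y0 + 1, x0] +
                         wy * wx * img[y0 + 1, x0 + 1])
    return out

image = np.arange(25, dtype='float32').reshape(5, 5)
print(crop_resize_reference(image, (1, 1, 3, 3), 4))
# [[ 4.5  5.   5.5  6. ]
#  [ 7.   7.5  8.   8.5]
#  [ 9.5 10.  10.5 11. ]
#  [12.  12.5 13.  13.5]]
```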
@@ -4,8 +4,8 @@
Reproduce [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083)
on ImageNet.
This is a 40Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x (arch2) g=8` in the paper.
After 100 epochs it reaches top-1 error of 42.62, matching the paper's number.
This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in [version 2](https://arxiv.org/pdf/1707.01083v2) of the paper.
After 240 epochs it reaches top-1 error of 42.32, better than the paper's number.
### Usage:
@@ -13,14 +13,14 @@ Print flops with tensorflow:
```bash
./shufflenet.py --flops
```
It will print about 80Mflops, because the paper counts multiply+add as 1 flop.
It will print about 75Mflops, because the paper counts multiply+add as 1 flop.
Train (takes 24 hours on 8 Maxwell TitanX):
Train (takes 36 hours on 8 P100s):
```bash
./shufflenet.py --data /path/to/ilsvrc/
```
Eval the [pretrained model](http://models.tensorpack.com/ShuffleNet/):
Evaluate the [pretrained model](http://models.tensorpack.com/ShuffleNet/):
```
./shufflenet.py --eval --data /path/to/ilsvrc --load /path/to/model
```
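A quick arithmetic sanity check of the flop numbers above (plain Python, not part of the repository): the profiler counts multiplies and adds separately, while the paper counts a fused multiply-add as one flop, so the printed figure is roughly twice the paper's.

```python
# The paper counts one multiply-add as a single flop; TF's profiler counts the
# multiply and the add separately, so its output is about 2x the paper's number.
reported_mflops = 75                    # approximate output of `./shufflenet.py --flops`
paper_mflops = reported_mflops / 2.0
print(paper_mflops)                     # ~37.5, consistent with the 38 Mflops quoted above
```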
@@ -23,7 +23,7 @@ from imagenet_utils import (
get_imagenet_dataflow,
ImageNetModel, GoogleNetResize, eval_on_ILSVRC12)
TOTAL_BATCH_SIZE = 256
TOTAL_BATCH_SIZE = 1024
@layer_register(log_shape=True)
@@ -48,6 +48,7 @@ def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
def channel_shuffle(l, group):
in_shape = l.get_shape().as_list()
in_channel = in_shape[1]
assert in_channel % group == 0, in_channel
l = tf.reshape(l, [-1, group, in_channel // group] + in_shape[-2:])
l = tf.transpose(l, [0, 2, 1, 3, 4])
l = tf.reshape(l, [-1, in_channel] + in_shape[-2:])
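As a quick illustration of what channel_shuffle computes, here is a NumPy sketch (not the tensorpack layer itself; it assumes NCHW layout, matching the data_format used in this file): split the channels into `group` groups and interleave them, so the next group convolution sees channels coming from every group.

```python
import numpy as np

def channel_shuffle_np(x, group):
    # x: (N, C, H, W); mirrors the reshape/transpose/reshape in the layer above
    n, c, h, w = x.shape
    assert c % group == 0
    x = x.reshape(n, group, c // group, h, w)
    x = x.transpose(0, 2, 1, 3, 4)
    return x.reshape(n, c, h, w)

x = np.arange(6).reshape(1, 6, 1, 1)            # channels labeled 0..5
print(channel_shuffle_np(x, group=3).ravel())   # [0 2 4 1 3 5]
```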
@@ -69,7 +70,7 @@ class Model(ImageNetModel):
# We do not apply group convolution on the first pointwise layer
# because the number of input channels is relatively small.
first_split = group if in_channel != 16 else 1
first_split = group if in_channel != 12 else 1
l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, nl=BNReLU)
l = channel_shuffle(l, group)
l = DepthConv('dconv', l, out_channel // 4, 3, nl=BN, stride=stride)
@@ -86,10 +87,10 @@ class Model(ImageNetModel):
with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format=self.data_format), \
argscope(Conv2D, use_bias=False):
group = 8
channels = [224, 416, 832]
group = 3
channels = [120, 240, 480]
l = Conv2D('conv1', image, 16, 3, stride=2, nl=BNReLU)
l = Conv2D('conv1', image, 12, 3, stride=2, nl=BNReLU)
l = MaxPooling('pool1', l, 3, 2, padding='SAME')
with tf.variable_scope('group1'):
@@ -98,7 +99,7 @@ class Model(ImageNetModel):
l = shufflenet_unit(l, channels[0], group, 2 if i == 0 else 1)
with tf.variable_scope('group2'):
for i in range(6):
for i in range(8):
with tf.variable_scope('block{}'.format(i)):
l = shufflenet_unit(l, channels[1], group, 2 if i == 0 else 1)
@@ -148,11 +149,15 @@ def get_config(model, nr_tower):
logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
dataset_train = get_data('train', batch)
dataset_val = get_data('val', batch)
step_size = 1280000 // TOTAL_BATCH_SIZE
max_iter = 3 * 10**5
max_epoch = (max_iter // step_size) + 1
callbacks = [
ModelSaver(),
ScheduledHyperParamSetter('learning_rate',
[(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]),
HumanHyperParamSetter('learning_rate'),
[(0, 0.5), (max_iter, 0)],
interp='linear', step_based=True),
]
infs = [ClassificationError('wrong-top1', 'val-error-top1'),
ClassificationError('wrong-top5', 'val-error-top5')]
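For reference, the training-schedule arithmetic above works out as follows (plain Python, using the TOTAL_BATCH_SIZE = 1024 set earlier in this diff); it is consistent with the "240 epochs" figure in the README change.

```python
# ImageNet has ~1.28M training images, so with a total batch size of 1024:
TOTAL_BATCH_SIZE = 1024
step_size = 1280000 // TOTAL_BATCH_SIZE     # 1250 steps per epoch
max_iter = 3 * 10**5                        # 300k training steps in total
max_epoch = (max_iter // step_size) + 1     # 241 epochs, i.e. the ~240 in the README
# The learning rate then decays linearly from 0.5 at step 0 to 0 at step 300000.
print(step_size, max_epoch)                 # 1250 241
```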
@@ -168,8 +173,8 @@ def get_config(model, nr_tower):
model=model,
dataflow=dataset_train,
callbacks=callbacks,
steps_per_epoch=5000,
max_epoch=100,
steps_per_epoch=step_size,
max_epoch=max_epoch,
)
@@ -207,8 +212,7 @@ if __name__ == '__main__':
cmd='op',
options=tf.profiler.ProfileOptionBuilder.float_operation())
else:
logger.set_logger_dir(
os.path.join('train_log', 'shufflenet'))
logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))
nr_tower = max(get_nr_gpu(), 1)
config = get_config(model, nr_tower)
@@ -246,12 +246,6 @@ class ScheduledHyperParamSetter(HyperParamSetter):
def _get_value_to_set(self):
refnum = self.global_step if self._step else self.epoch_num
if self.interp is None:
for e, v in self.schedule:
if e == refnum:
return v
return None
else:
laste, lastv = None, None
for e, v in self.schedule:
if e == refnum:
@@ -262,7 +256,10 @@ class ScheduledHyperParamSetter(HyperParamSetter):
if laste is None or laste == e:
# hasn't reached the first scheduled point, or reached the end of all scheduled points
return None
if self.interp is not None:
v = (refnum - laste) * 1. / (e - laste) * (v - lastv) + lastv
else:
v = lastv
return v
def _trigger_epoch(self):
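The ScheduledHyperParamSetter change above (the "fix paramsetter for restore" part of the commit) makes the non-interpolating schedule fall back to the most recent scheduled value instead of only firing exactly at scheduled points, so a run restored from a checkpoint mid-schedule still picks up the right setting. A minimal standalone sketch of the resulting lookup logic (a plain function written for illustration, not tensorpack API):

```python
def value_at(schedule, refnum, interp=None):
    """schedule: [(point, value), ...] sorted by point; refnum: current step or epoch."""
    laste, lastv = None, None
    for e, v in schedule:
        if e == refnum:
            return v                # exactly on a scheduled point
        if e > refnum:
            break
        laste, lastv = e, v
    if laste is None or laste == e:
        # before the first scheduled point, or past the last one
        return None
    if interp == 'linear':
        return (refnum - laste) * 1. / (e - laste) * (v - lastv) + lastv
    return lastv                    # interp=None: hold the last scheduled value

schedule = [(0, 0.5), (300000, 0.0)]
print(value_at(schedule, 150000, interp='linear'))  # 0.25
print(value_at(schedule, 150000))                   # 0.5 (value held until the next point)
```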