Commit 3e9f164d authored by Yuxin Wu

Upgrade shufflenet; fix paramsetter for restore

parent 9185744d
@@ -54,7 +54,7 @@ class Model(DQNModel):
     def _get_DQN_prediction(self, image):
         """ image: [0,255]"""
         image = image / 255.0
-        with argscope(Conv2D, nl=PReLU.symbolic_function, use_bias=True):
+        with argscope(Conv2D, activation=lambda x: PReLU('prelu', x), use_bias=True):
             l = (LinearWrap(image)
                  # Nature architecture
                  .Conv2D('conv0', out_channel=32, kernel_shape=8, stride=4)
...
@@ -15,6 +15,8 @@ assert tensorpack.tfutils.common.get_tf_version_number() >= 1.2

 class Model(ModelDesc):
+    learning_rate = 1e-3
+
     def __init__(self, image_shape, channel, method, num_actions, gamma):
         self.image_shape = image_shape
         self.channel = channel
@@ -80,7 +82,7 @@ class Model(ModelDesc):
         summary.add_moving_summary(self.cost)

     def _get_optimizer(self):
-        lr = tf.get_variable('learning_rate', initializer=1e-3, trainable=False)
+        lr = tf.get_variable('learning_rate', initializer=self.learning_rate, trainable=False)
         opt = tf.train.AdamOptimizer(lr, epsilon=1e-3)
         return optimizer.apply_grad_processors(
             opt, [gradproc.GlobalNormClip(10), gradproc.SummaryGradient()])
...
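The new `learning_rate` class attribute replaces the hard-coded `1e-3` in `_get_optimizer`, so the initial learning rate can be changed by overriding one attribute rather than the whole optimizer. A minimal sketch of the intended usage (the subclass name and value here are hypothetical, not from this commit):

```python
# Hypothetical subclass: override the class attribute instead of
# redefining _get_optimizer just to change the initial learning rate.
class MyDQNModel(Model):
    learning_rate = 4e-4  # illustrative value only
```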
@@ -546,3 +546,35 @@ def maskrcnn_loss(mask_logits, fg_labels, fg_target_masks):
     add_moving_summary(loss, accuracy, fg_pixel_ratio, pos_accuracy)
     return loss
+
+
+if __name__ == '__main__':
+    """
+    Demonstrate what's wrong with tf.image.crop_and_resize:
+    """
+    import numpy as np
+    import tensorflow.contrib.eager as tfe
+    tfe.enable_eager_execution()
+
+    # We want to crop a 2x2 box out of a 5x5 image, and resize it to 4x4.
+    image = np.arange(25).astype('float32').reshape(5, 5)
+    boxes = np.asarray([[1, 1, 3, 3]], dtype='float32')
+    target = 4
+
+    print(crop_and_resize(
+        image[None, None, :, :], boxes, [0], target)[0][0])
+    """
+    Expected values:
+    4.5 5 5.5 6
+    7 7.5 8 8.5
+    9.5 10 10.5 11
+    12 12.5 13 13.5
+
+    Our implementation is not perfect either: when boxes lie on the image
+    border, TF pads with zeros instead of border values. But this rarely
+    happens, so it's fine.
+
+    You cannot easily get the above results with tf.image.crop_and_resize.
+    Try it yourself here:
+    """
+    print(tf.image.crop_and_resize(
+        image[None, :, :, None],
+        np.asarray([[1, 1, 2, 2]]) / 4.0, [0], [target, target])[0][:, :, 0])
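For reference, the mismatch demonstrated above comes from the sampling grid `tf.image.crop_and_resize` builds: normalized box coordinates are scaled by `(size - 1)` and the crop is sampled align-corners style. A minimal numpy sketch of that grid, assuming the documented TF semantics (this sketch is not part of the commit):

```python
import numpy as np

# Sampling grid of tf.image.crop_and_resize for the demo box, assuming
# its documented (size - 1) scaling and align-corners crop sampling.
H = W = 5
y1, x1, y2, x2 = np.asarray([1., 1., 2., 2.]) / 4.0  # normalized box from the demo
crop = 4
ys = y1 * (H - 1) + np.arange(crop) * (y2 - y1) * (H - 1) / (crop - 1)
xs = x1 * (W - 1) + np.arange(crop) * (x2 - x1) * (W - 1) / (crop - 1)
# The demo image is image[y, x] = 5 * y + x, on which bilinear sampling is exact:
print(ys[:, None] * 5 + xs[None, :])  # top-left sample is 6.0, not the expected 4.5
```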
@@ -4,8 +4,8 @@
 Reproduce [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083)
 on ImageNet.

-This is a 40Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x (arch2) g=8` in the paper.
-After 100 epochs it reaches top-1 error of 42.62, matching the paper's number.
+This is a 38Mflops ShuffleNet, corresponding to `ShuffleNet 0.5x g=3` in [version 2](https://arxiv.org/pdf/1707.01083v2) of the paper.
+After 240 epochs it reaches top-1 error of 42.32, better than the paper's number.

 ### Usage:
@@ -13,14 +13,14 @@ Print flops with tensorflow:
 ```bash
 ./shufflenet.py --flops
 ```
-It will print about 80Mflops, because the paper counts multiply+add as 1 flop.
+It will print about 75Mflops, because the paper counts multiply+add as 1 flop.

-Train (takes 24 hours on 8 Maxwell TitanX):
+Train (takes 36 hours on 8 P100s):
 ```bash
 ./shufflenet.py --data /path/to/ilsvrc/
 ```

-Eval the [pretrained model](http://models.tensorpack.com/ShuffleNet/):
+Evaluate the [pretrained model](http://models.tensorpack.com/ShuffleNet/):
 ```
 ./shufflenet.py --eval --data /path/to/ilsvrc --load /path/to/model
 ```
@@ -23,7 +23,7 @@ from imagenet_utils import (
     get_imagenet_dataflow,
     ImageNetModel, GoogleNetResize, eval_on_ILSVRC12)

-TOTAL_BATCH_SIZE = 256
+TOTAL_BATCH_SIZE = 1024


 @layer_register(log_shape=True)
@@ -48,6 +48,7 @@ def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
 def channel_shuffle(l, group):
     in_shape = l.get_shape().as_list()
     in_channel = in_shape[1]
+    assert in_channel % group == 0, in_channel
     l = tf.reshape(l, [-1, group, in_channel // group] + in_shape[-2:])
     l = tf.transpose(l, [0, 2, 1, 3, 4])
     l = tf.reshape(l, [-1, in_channel] + in_shape[-2:])
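The added assertion guards the reshape above: the shuffle is only well-defined when the channel count divides evenly by `group`. A small numpy illustration of the same reshape-transpose-reshape trick, applied to a bare channel index vector (illustration only, not code from this commit):

```python
import numpy as np

# Channel shuffle of 6 channels with group=3: after the shuffle, every
# contiguous slice of the output contains one channel from each group.
channels = np.arange(6)  # groups are [0 1], [2 3], [4 5]
group = 3
shuffled = channels.reshape(group, -1).T.reshape(-1)
print(shuffled)  # [0 2 4 1 3 5]
```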
@@ -69,7 +70,7 @@ class Model(ImageNetModel):
         # We do not apply group convolution on the first pointwise layer
         # because the number of input channels is relatively small.
-        first_split = group if in_channel != 16 else 1
+        first_split = group if in_channel != 12 else 1
         l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, nl=BNReLU)
         l = channel_shuffle(l, group)
         l = DepthConv('dconv', l, out_channel // 4, 3, nl=BN, stride=stride)
@@ -86,10 +87,10 @@ class Model(ImageNetModel):
         with argscope([Conv2D, MaxPooling, AvgPooling, GlobalAvgPooling, BatchNorm], data_format=self.data_format), \
                 argscope(Conv2D, use_bias=False):
-            group = 8
-            channels = [224, 416, 832]
-            l = Conv2D('conv1', image, 16, 3, stride=2, nl=BNReLU)
+            group = 3
+            channels = [120, 240, 480]
+            l = Conv2D('conv1', image, 12, 3, stride=2, nl=BNReLU)
             l = MaxPooling('pool1', l, 3, 2, padding='SAME')

             with tf.variable_scope('group1'):
@@ -98,7 +99,7 @@ class Model(ImageNetModel):
                     l = shufflenet_unit(l, channels[0], group, 2 if i == 0 else 1)

             with tf.variable_scope('group2'):
-                for i in range(6):
+                for i in range(8):
                     with tf.variable_scope('block{}'.format(i)):
                         l = shufflenet_unit(l, channels[1], group, 2 if i == 0 else 1)
@@ -148,11 +149,15 @@ def get_config(model, nr_tower):
     logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

     dataset_train = get_data('train', batch)
     dataset_val = get_data('val', batch)
+
+    step_size = 1280000 // TOTAL_BATCH_SIZE
+    max_iter = 3 * 10**5
+    max_epoch = (max_iter // step_size) + 1
     callbacks = [
         ModelSaver(),
-        ScheduledHyperParamSetter('learning_rate',
-                                  [(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]),
-        HumanHyperParamSetter('learning_rate'),
+        ScheduledHyperParamSetter('learning_rate',
+                                  [(0, 0.5), (max_iter, 0)],
+                                  interp='linear', step_based=True),
     ]
     infs = [ClassificationError('wrong-top1', 'val-error-top1'),
             ClassificationError('wrong-top5', 'val-error-top5')]
@@ -168,8 +173,8 @@ def get_config(model, nr_tower):
         model=model,
         dataflow=dataset_train,
         callbacks=callbacks,
-        steps_per_epoch=5000,
-        max_epoch=100,
+        steps_per_epoch=step_size,
+        max_epoch=max_epoch,
     )
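The new schedule `[(0, 0.5), (max_iter, 0)]` with `interp='linear', step_based=True` decays the learning rate linearly from 0.5 to 0 over 3e5 steps, replacing the old stepwise drops at fixed epochs. A quick sketch of the value the setter computes at a given global step, using the same interpolation formula as the callback change further down:

```python
# Linearly interpolated learning rate for the schedule [(0, 0.5), (max_iter, 0)]:
# v = (step - laste) * 1. / (e - laste) * (v - lastv) + lastv
def lr_at(step, max_iter=3 * 10**5):
    return (step - 0) * 1. / (max_iter - 0) * (0. - 0.5) + 0.5

print(lr_at(0))       # 0.5
print(lr_at(150000))  # 0.25
print(lr_at(300000))  # 0.0
```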
@@ -207,8 +212,7 @@ if __name__ == '__main__':
             cmd='op',
             options=tf.profiler.ProfileOptionBuilder.float_operation())
     else:
-        logger.set_logger_dir(
-            os.path.join('train_log', 'shufflenet'))
+        logger.set_logger_dir(os.path.join('train_log', 'shufflenet'))

         nr_tower = max(get_nr_gpu(), 1)
         config = get_config(model, nr_tower)
...
@@ -246,12 +246,6 @@ class ScheduledHyperParamSetter(HyperParamSetter):

     def _get_value_to_set(self):
         refnum = self.global_step if self._step else self.epoch_num
-        if self.interp is None:
-            for e, v in self.schedule:
-                if e == refnum:
-                    return v
-            return None
-        else:
         laste, lastv = None, None
         for e, v in self.schedule:
             if e == refnum:
@@ -262,7 +256,10 @@ class ScheduledHyperParamSetter(HyperParamSetter):
         if laste is None or laste == e:
             # hasn't reached the first scheduled point, or reached the end of all scheduled points
             return None
-        v = (refnum - laste) * 1. / (e - laste) * (v - lastv) + lastv
+        if self.interp is not None:
+            v = (refnum - laste) * 1. / (e - laste) * (v - lastv) + lastv
+        else:
+            v = lastv
         return v

     def _trigger_epoch(self):
...