Commit e9caf75e authored by Yuxin Wu's avatar Yuxin Wu

STN readme

parent a2cb06fb
@@ -10,6 +10,7 @@ You can actually train them and reproduce the performance... not just see how to
 + [InceptionV3 on ImageNet](examples/Inception/inceptionv3.py)
 + [ResNet for Cifar10 classification](examples/ResNet)
 + [Fully-convolutional Network for Holistically-Nested Edge Detection](examples/HED)
++ [Spatial Transformer Networks on MNIST addition](examples/SpatialTransformer)
 + [Double-DQN plays Atari games](examples/Atari2600)
 + [Batch-A3C plays Atari games with demos on OpenAI Gym](examples/OpenAIGym)
 + [char-rnn language model](examples/char-rnn)
...
@@ -5,11 +5,12 @@ Training examples with __reproducible__ and meaningful performance.
 + [An illustrative mnist example with explanation of the framework](mnist-convnet.py)
 + [A tiny SVHN ConvNet with 97.5% accuracy](svhn-digit-convnet.py)
-+ Reinforcement learning (DQN, A3C) on [Atari games](Atari2600) and [demos on OpenAI Gym](OpenAIGym).
-+ [char-rnn for fun](char-rnn)
-+ [DisturbLabel, because I don't believe the paper](DisturbLabel)
-+ [DoReFa-Net: binary / low-bitwidth CNN on ImageNet](DoReFa-Net)
 + [Inception-BN with 71% accuracy](Inception/inception-bn.py)
 + [InceptionV3 with 74.5% accuracy (similar to the official code)](Inception/inceptionv3.py)
++ [DoReFa-Net: binary / low-bitwidth CNN on ImageNet](DoReFa-Net)
 + [ResNet for Cifar10 and SVHN](ResNet)
 + [Holistically-Nested Edge Detection](HED)
++ [Spatial Transformer Networks on MNIST addition](SpatialTransformer)
++ [DisturbLabel, because I don't believe the paper](DisturbLabel)
++ Reinforcement learning (DQN, A3C) on [Atari games](Atari2600) and [demos on OpenAI Gym](OpenAIGym).
++ [char-rnn for fun](char-rnn)
## Spatial Transformer Network
Reproduce the "MNIST addition" experiments in
[Spatial Transformer Networks](https://arxiv.org/abs/1506.02025)
by Max Jaderberg et al.

Given an image of two distorted MNIST digits stacked in two channels, the network is trained
to produce their sum. The two Spatial Transformer branches learn to localize the two digits
and warp them separately.
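
For concreteness, one plausible way to assemble such a two-channel sample (a minimal numpy sketch; the 42-pixel canvas, the random placement, and the `make_addition_sample` name are illustrative assumptions, and the actual example additionally distorts the digits):

```python
import numpy as np

def make_addition_sample(digit_a, digit_b, label_a, label_b, canvas_size=42):
    # stack two 28x28 MNIST digits at random positions on separate channels;
    # the classification target is the sum of the two digit labels (0..18)
    canvas = np.zeros((canvas_size, canvas_size, 2), dtype='float32')
    for ch, digit in enumerate([digit_a, digit_b]):
        y, x = np.random.randint(0, canvas_size - 28 + 1, size=2)
        canvas[y:y + 28, x:x + 28, ch] = digit
    return canvas, label_a + label_b
```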
![demo](demo.jpg)

(Left: input image; Middle: output of the first STN branch (which localizes the second digit); Right: output of the second STN branch.)
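
Each STN branch predicts a 2x3 affine matrix and resamples its input through the corresponding grid, which is what localizes and un-distorts a digit. Below is a plain-numpy sketch of that sampling step (illustrative only; the example implements it as a differentiable TensorFlow layer, and real STNs zero-pad out-of-range samples rather than clamping at the border):

```python
import numpy as np

def affine_grid_sample(img, theta, out_shape):
    # generate normalized target coords, map them through the 2x3 affine
    # `theta` into source-image space, then sample bilinearly
    H, W = img.shape
    oh, ow = out_shape
    ys, xs = np.meshgrid(np.linspace(-1, 1, oh), np.linspace(-1, 1, ow), indexing='ij')
    coords = np.stack([xs.ravel(), ys.ravel(), np.ones(oh * ow)])  # (3, oh*ow)
    sx, sy = theta.dot(coords)                      # source coords in [-1, 1]
    sx = np.clip((sx + 1) * (W - 1) / 2, 0, W - 1)  # to pixel coords (clamped)
    sy = np.clip((sy + 1) * (H - 1) / 2, 0, H - 1)
    x0 = np.minimum(np.floor(sx).astype(int), W - 2)
    y0 = np.minimum(np.floor(sy).astype(int), H - 2)
    wx, wy = sx - x0, sy - y0                       # bilinear weights
    out = (img[y0, x0] * (1 - wx) * (1 - wy) + img[y0, x0 + 1] * wx * (1 - wy) +
           img[y0 + 1, x0] * (1 - wx) * wy + img[y0 + 1, x0 + 1] * wx * wy)
    return out.reshape(oh, ow)
```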
To train (takes about 300 epochs to reach 8.8% error):
```bash
./mnist-addition.py
```
To draw the above visualization with a [pretrained model](https://drive.google.com/drive/folders/0B9IPQTvr2BBkUWM3X0hDZHJtTmc?usp=sharing):
```bash
./mnist-addition.py --load pretrained.npy --view
```
@@ -65,7 +65,7 @@ class Model(ModelDesc):
                 tf.expand_dims(stacked, -1), max_images=30)
         sampled = tf.concat(3, [sampled1, sampled2], 'sampled_concat')
-        logits = (LinearWrap(sampled) # the starting brace is only for line-breaking
+        logits = (LinearWrap(sampled)
                   .apply(symbf.batch_flatten)
                   .FullyConnected('fc1', out_dim=256, nl=tf.nn.relu)
                   .FullyConnected('fc2', out_dim=128, nl=tf.nn.relu)
@@ -86,7 +86,7 @@ class Model(ModelDesc):
     def get_gradient_processor(self):
         return [MapGradient(lambda grad: tf.clip_by_global_norm([grad], 5)[0][0]),
-                ScaleGradient([('STN.*', 0.3)]), SummaryGradient()]
+                ScaleGradient([('STN.*', 0.1)]), SummaryGradient()]
 
 def get_data(isTrain):
     ds = dataset.Mnist('train' if isTrain else 'test')
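
For context on the hunk above: tensorpack's `MapGradient` applies a function to every gradient, and `ScaleGradient` rescales the gradients of variables whose names match a regex, so this change damps the STN localization branches harder (0.3 → 0.1). A rough plain TF 1.x equivalent, as a sketch over an assumed `grads_and_vars` list (not tensorpack's actual implementation):

```python
import re
import tensorflow as tf  # TF 1.x style

def process_gradients(grads_and_vars, stn_scale=0.1, clip_norm=5.0):
    processed = []
    for grad, var in grads_and_vars:
        if grad is None:
            processed.append((grad, var))
            continue
        grad = tf.clip_by_norm(grad, clip_norm)  # clip each gradient, like MapGradient above
        if re.match('STN.*', var.op.name):
            grad = grad * stn_scale              # slow down the localization networks
        processed.append((grad, var))
    return processed
```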
@@ -133,8 +133,9 @@ def view_warp(modelpath):
         outputs, affine1, affine2 = pred([img])
         for idx, viz in enumerate(outputs):
             viz = cv2.cvtColor(viz, cv2.COLOR_GRAY2BGR)
-            draw_rect(viz, affine1[idx], (0,0,255))
-            draw_rect(viz, affine2[idx], (0,0,255), offset=[IMAGE_SIZE, 0])
+            # Here we assume the second branch focuses on the first digit
+            draw_rect(viz, affine2[idx], (0,0,255))
+            draw_rect(viz, affine1[idx], (0,0,255), offset=[IMAGE_SIZE, 0])
             cv2.imwrite('{:03d}.png'.format(idx), (viz + 0.5) * 255)
         break
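
`draw_rect` marks the region a branch attends to; its implementation isn't shown in this diff. A hypothetical version would map the corners of the STN output square (normalized coordinates in [-1, 1]) through the predicted 2x3 affine back into input pixel coordinates:

```python
import cv2
import numpy as np

def draw_affine_region(canvas, theta, size, color=(0, 0, 255), offset=(0, 0)):
    # hypothetical draw_rect: push the four corners of the output square
    # through the affine to find the attended quadrilateral in the input panel
    corners = np.array([[-1, -1, 1], [1, -1, 1], [1, 1, 1], [-1, 1, 1]], dtype='float32')
    pts = corners.dot(np.asarray(theta, dtype='float32').T)    # (4, 2), normalized
    pts = (pts + 1) / 2 * (size - 1) + np.asarray(offset)      # to pixel coords
    cv2.polylines(canvas, [pts.astype(np.int32).reshape(-1, 1, 2)],
                  isClosed=True, color=color)
```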
@@ -149,12 +150,12 @@ def get_config():
     return TrainConfig(
         dataset=dataset_train,
-        optimizer=tf.train.AdamOptimizer(lr),
+        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
         callbacks=Callbacks([
             StatPrinter(), ModelSaver(),
             InferenceRunner(dataset_test,
                 [ScalarStats('cost'), ClassificationError()]),
-            ScheduledHyperParamSetter('learning_rate', [(200, 1e-4), (400, 8e-5)])
+            ScheduledHyperParamSetter('learning_rate', [(200, 1e-4)])
         ]),
         session_config=get_default_sess_config(0.5),
         model=Model(),
@@ -172,10 +173,9 @@ if __name__ == '__main__':
         os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
     if args.view:
         view_warp(args.load)
-        sys.exit()
-
-    config = get_config()
-    if args.load:
-        config.session_init = SaverRestore(args.load)
-    SimpleTrainer(config).train()
+    else:
+        config = get_config()
+        if args.load:
+            config.session_init = SaverRestore(args.load)
+        SimpleTrainer(config).train()
@@ -151,7 +151,8 @@ if __name__ == '__main__':
     config = get_config(args.classnum)
     if args.load:
         config.session_init = SaverRestore(args.load)
+    if args.gpu:
+        config.nr_tower = len(args.gpu.split(','))
     QueueInputTrainer(config).train()
-    #if args.gpu:
-    #    config.nr_tower = len(args.gpu.split(','))
     #AsyncMultiGPUTrainer(config).train()
@@ -43,6 +43,7 @@ class Model(ModelDesc):
         l = Conv2D('conv5', l, out_channel=256, kernel_shape=3, split=2)
         l = MaxPooling('pool3', l, 3, stride=2, padding='VALID')
+        # This is just a script to load a model, so we ignore the dropout layer
         l = FullyConnected('fc6', l, 4096)
         l = FullyConnected('fc7', l, out_dim=4096)
         # fc will have activation summary by default. disable this for the output layer
...
@@ -123,6 +123,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = SaverRestore(args.load)
-    if args.gpu:
-        config.nr_tower = len(args.gpu.split(','))
     QueueInputTrainer(config).train()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: viz.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>

import numpy as np


def minnone(x, y):
    """Return min(x, y), treating None as "no constraint"."""
    if x is None:
        x = y
    elif y is None:
        y = x
    return min(x, y)


def build_patch_list(patch_list,
                     nr_row=None, nr_col=None, border=5,
                     max_width=1000, max_height=1000,
                     shuffle=False, bgcolor=255):
    """
    Generate canvases that tile the patches in a grid.
    patch_list: an array of shape bhw or bhwc
    """
    patch_list = np.asarray(patch_list)
    if patch_list.ndim == 3:
        patch_list = patch_list[:, :, :, np.newaxis]
    assert patch_list.ndim == 4 and patch_list.shape[3] in [1, 3], patch_list.shape
    if shuffle:
        np.random.shuffle(patch_list)
    ph, pw = patch_list.shape[1:3]
    # if nr_row/nr_col are not given, fit as many patches as the size limits allow
    nr_row = minnone(nr_row, max_height // (ph + border))
    nr_col = minnone(nr_col, max_width // (pw + border))

    canvas = np.zeros((nr_row * (ph + border) - border,
                       nr_col * (pw + border) - border,
                       patch_list.shape[3]), dtype='uint8')

    def draw_patch(plist):
        cur_row, cur_col = 0, 0
        canvas.fill(bgcolor)
        for patch in plist:
            r0 = cur_row * (ph + border)
            c0 = cur_col * (pw + border)
            canvas[r0:r0 + ph, c0:c0 + pw] = patch
            cur_col += 1
            if cur_col == nr_col:
                cur_col = 0
                cur_row += 1

    # yield one full canvas at a time until all patches are consumed
    nr_patch = nr_row * nr_col
    start = 0
    while True:
        end = start + nr_patch
        cur_list = patch_list[start:end]
        if not len(cur_list):
            return
        draw_patch(cur_list)
        yield canvas
        start = end


if __name__ == '__main__':
    import cv2
    # expects 000.png .. 099.png, as written by view_warp in mnist-addition.py
    imglist = []
    for i in range(100):
        fname = "{:03d}.png".format(i)
        imglist.append(cv2.imread(fname))
    for idx, patch in enumerate(build_patch_list(
            imglist, max_width=500, max_height=200)):
        of = "patch{:02d}.png".format(idx)
        cv2.imwrite(of, patch)
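
The `__main__` block stitches the per-sample images written by `view_warp` into grid canvases of bounded size; the demo figure in the README was presumably assembled this way.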