Commit e9caf75e authored by Yuxin Wu's avatar Yuxin Wu

STN readme

parent a2cb06fb
@@ -10,6 +10,7 @@ You can actually train them and reproduce the performance... not just see how to
 + [InceptionV3 on ImageNet](examples/Inception/inceptionv3.py)
 + [ResNet for Cifar10 classification](examples/ResNet)
 + [Fully-convolutional Network for Holistically-Nested Edge Detection](examples/HED)
++ [Spatial Transformer Networks on MNIST addition](examples/SpatialTransformer)
 + [Double-DQN plays Atari games](examples/Atari2600)
 + [Batch-A3C plays Atari games with demos on OpenAI Gym](examples/OpenAIGym)
 + [char-rnn language model](examples/char-rnn)
...
@@ -5,11 +5,12 @@ Training examples with __reproducible__ and meaningful performance.
 + [An illustrative mnist example with explanation of the framework](mnist-convnet.py)
 + [A tiny SVHN ConvNet with 97.5% accuracy](svhn-digit-convnet.py)
-+ Reinforcement learning (DQN, A3C) on [Atari games](Atari2600) and [demos on OpenAI Gym](OpenAIGym).
-+ [char-rnn for fun](char-rnn)
-+ [DisturbLabel, because I don't believe the paper](DisturbLabel)
-+ [DoReFa-Net: binary / low-bitwidth CNN on ImageNet](DoReFa-Net)
 + [Inception-BN with 71% accuracy](Inception/inception-bn.py)
 + [InceptionV3 with 74.5% accuracy (similar to the official code)](Inception/inceptionv3.py)
++ [DoReFa-Net: binary / low-bitwidth CNN on ImageNet](DoReFa-Net)
 + [ResNet for Cifar10 and SVHN](ResNet)
 + [Holistically-Nested Edge Detection](HED)
++ [Spatial Transformer Networks on MNIST addition](SpatialTransformer)
++ [DisturbLabel, because I don't believe the paper](DisturbLabel)
++ Reinforcement learning (DQN, A3C) on [Atari games](Atari2600) and [demos on OpenAI Gym](OpenAIGym).
++ [char-rnn for fun](char-rnn)
## Spatial Transformer Network
Reproduce the "MNIST addition" experiments in
[Spatial Transformer Networks](https://arxiv.org/abs/1506.02025)
by Max Jaderberg et al.

Given an image of two distorted MNIST digits stacked in two channels, the network is trained
to produce their sum. The two Spatial Transformer branches learn to localize the two digits
and warp them separately.
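
For concreteness, one plausible way to assemble such a two-channel sample (a minimal numpy sketch; the 42-pixel canvas, the random placement, and the `make_addition_sample` name are illustrative assumptions, and the actual example additionally distorts the digits):

```python
import numpy as np

def make_addition_sample(digit_a, digit_b, label_a, label_b, canvas_size=42):
    # stack two 28x28 MNIST digits at random positions on separate channels;
    # the classification target is the sum of the two digit labels (0..18)
    canvas = np.zeros((canvas_size, canvas_size, 2), dtype='float32')
    for ch, digit in enumerate([digit_a, digit_b]):
        y, x = np.random.randint(0, canvas_size - 28 + 1, size=2)
        canvas[y:y + 28, x:x + 28, ch] = digit
    return canvas, label_a + label_b
```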
![demo](demo.jpg)

(Left: input image; Middle: output of the first STN branch (which localizes the second digit); Right: output of the second STN branch.)
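
Each STN branch predicts a 2x3 affine matrix and resamples its input through the corresponding grid, which is what localizes and un-distorts a digit. Below is a plain-numpy sketch of that sampling step (illustrative only; the example implements it as a differentiable TensorFlow layer, and real STNs zero-pad out-of-range samples rather than clamping at the border):

```python
import numpy as np

def affine_grid_sample(img, theta, out_shape):
    # generate normalized target coords, map them through the 2x3 affine
    # `theta` into source-image space, then sample bilinearly
    H, W = img.shape
    oh, ow = out_shape
    ys, xs = np.meshgrid(np.linspace(-1, 1, oh), np.linspace(-1, 1, ow), indexing='ij')
    coords = np.stack([xs.ravel(), ys.ravel(), np.ones(oh * ow)])  # (3, oh*ow)
    sx, sy = theta.dot(coords)                      # source coords in [-1, 1]
    sx = np.clip((sx + 1) * (W - 1) / 2, 0, W - 1)  # to pixel coords (clamped)
    sy = np.clip((sy + 1) * (H - 1) / 2, 0, H - 1)
    x0 = np.minimum(np.floor(sx).astype(int), W - 2)
    y0 = np.minimum(np.floor(sy).astype(int), H - 2)
    wx, wy = sx - x0, sy - y0                       # bilinear weights
    out = (img[y0, x0] * (1 - wx) * (1 - wy) + img[y0, x0 + 1] * wx * (1 - wy) +
           img[y0 + 1, x0] * (1 - wx) * wy + img[y0 + 1, x0 + 1] * wx * wy)
    return out.reshape(oh, ow)
```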
To train (takes about 300 epochs to reach 8.8% error):
```bash
./mnist-addition.py
```
To draw the above visualization with a [pretrained model](https://drive.google.com/drive/folders/0B9IPQTvr2BBkUWM3X0hDZHJtTmc?usp=sharing):
```bash
./mnist-addition.py --load pretrained.npy --view
```
@@ -65,7 +65,7 @@ class Model(ModelDesc):
                 tf.expand_dims(stacked, -1), max_images=30)
         sampled = tf.concat(3, [sampled1, sampled2], 'sampled_concat')
-        logits = (LinearWrap(sampled) # the starting brace is only for line-breaking
+        logits = (LinearWrap(sampled)
                   .apply(symbf.batch_flatten)
                   .FullyConnected('fc1', out_dim=256, nl=tf.nn.relu)
                   .FullyConnected('fc2', out_dim=128, nl=tf.nn.relu)
@@ -86,7 +86,7 @@ class Model(ModelDesc):
     def get_gradient_processor(self):
         return [MapGradient(lambda grad: tf.clip_by_global_norm([grad], 5)[0][0]),
-                ScaleGradient([('STN.*', 0.3)]), SummaryGradient()]
+                ScaleGradient([('STN.*', 0.1)]), SummaryGradient()]
 
 def get_data(isTrain):
     ds = dataset.Mnist('train' if isTrain else 'test')
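
For context on the hunk above: tensorpack's `MapGradient` applies a function to every gradient, and `ScaleGradient` rescales the gradients of variables whose names match a regex, so this change damps the STN localization branches harder (0.3 → 0.1). A rough plain TF 1.x equivalent, as a sketch over an assumed `grads_and_vars` list (not tensorpack's actual implementation):

```python
import re
import tensorflow as tf  # TF 1.x style

def process_gradients(grads_and_vars, stn_scale=0.1, clip_norm=5.0):
    processed = []
    for grad, var in grads_and_vars:
        if grad is None:
            processed.append((grad, var))
            continue
        grad = tf.clip_by_norm(grad, clip_norm)  # clip each gradient, like MapGradient above
        if re.match('STN.*', var.op.name):
            grad = grad * stn_scale              # slow down the localization networks
        processed.append((grad, var))
    return processed
```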
@@ -133,8 +133,9 @@ def view_warp(modelpath):
         outputs, affine1, affine2 = pred([img])
         for idx, viz in enumerate(outputs):
             viz = cv2.cvtColor(viz, cv2.COLOR_GRAY2BGR)
-            draw_rect(viz, affine1[idx], (0,0,255))
-            draw_rect(viz, affine2[idx], (0,0,255), offset=[IMAGE_SIZE, 0])
+            # Here we assume the second branch focuses on the first digit
+            draw_rect(viz, affine2[idx], (0,0,255))
+            draw_rect(viz, affine1[idx], (0,0,255), offset=[IMAGE_SIZE, 0])
             cv2.imwrite('{:03d}.png'.format(idx), (viz + 0.5) * 255)
         break
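
`draw_rect` marks the region a branch attends to; its implementation isn't shown in this diff. A hypothetical version would map the corners of the STN output square (normalized coordinates in [-1, 1]) through the predicted 2x3 affine back into input pixel coordinates:

```python
import cv2
import numpy as np

def draw_affine_region(canvas, theta, size, color=(0, 0, 255), offset=(0, 0)):
    # hypothetical draw_rect: push the four corners of the output square
    # through the affine to find the attended quadrilateral in the input panel
    corners = np.array([[-1, -1, 1], [1, -1, 1], [1, 1, 1], [-1, 1, 1]], dtype='float32')
    pts = corners.dot(np.asarray(theta, dtype='float32').T)    # (4, 2), normalized
    pts = (pts + 1) / 2 * (size - 1) + np.asarray(offset)      # to pixel coords
    cv2.polylines(canvas, [pts.astype(np.int32).reshape(-1, 1, 2)],
                  isClosed=True, color=color)
```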
@@ -149,12 +150,12 @@ def get_config():
     return TrainConfig(
         dataset=dataset_train,
-        optimizer=tf.train.AdamOptimizer(lr),
+        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
         callbacks=Callbacks([
             StatPrinter(), ModelSaver(),
             InferenceRunner(dataset_test,
                 [ScalarStats('cost'), ClassificationError()]),
-            ScheduledHyperParamSetter('learning_rate', [(200, 1e-4), (400, 8e-5)])
+            ScheduledHyperParamSetter('learning_rate', [(200, 1e-4)])
         ]),
         session_config=get_default_sess_config(0.5),
         model=Model(),
@@ -172,10 +173,9 @@ if __name__ == '__main__':
         os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
     if args.view:
         view_warp(args.load)
-        sys.exit()
-
-    config = get_config()
-    if args.load:
-        config.session_init = SaverRestore(args.load)
-    SimpleTrainer(config).train()
+    else:
+        config = get_config()
+        if args.load:
+            config.session_init = SaverRestore(args.load)
+        SimpleTrainer(config).train()
@@ -151,7 +151,8 @@ if __name__ == '__main__':
     config = get_config(args.classnum)
     if args.load:
         config.session_init = SaverRestore(args.load)
+    if args.gpu:
+        config.nr_tower = len(args.gpu.split(','))
     QueueInputTrainer(config).train()
-    #if args.gpu:
-    #    config.nr_tower = len(args.gpu.split(','))
     #AsyncMultiGPUTrainer(config).train()
@@ -43,6 +43,7 @@ class Model(ModelDesc):
         l = Conv2D('conv5', l, out_channel=256, kernel_shape=3, split=2)
         l = MaxPooling('pool3', l, 3, stride=2, padding='VALID')
+        # This is just a script to load a model, so we ignore the dropout layer
         l = FullyConnected('fc6', l, 4096)
         l = FullyConnected('fc7', l, out_dim=4096)
         # fc will have activation summary by default. disable this for the output layer
...
@@ -123,6 +123,4 @@ if __name__ == '__main__':
     config = get_config()
     if args.load:
         config.session_init = SaverRestore(args.load)
-    if args.gpu:
-        config.nr_tower = len(args.gpu.split(','))
     QueueInputTrainer(config).train()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: viz.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>

import numpy as np


def minnone(x, y):
    """Return min(x, y), treating None as "no constraint"."""
    if x is None:
        x = y
    elif y is None:
        y = x
    return min(x, y)


def build_patch_list(patch_list,
                     nr_row=None, nr_col=None, border=5,
                     max_width=1000, max_height=1000,
                     shuffle=False, bgcolor=255):
    """
    Generate canvases that tile the patches in a grid.
    patch_list: an array of shape bhw or bhwc
    """
    patch_list = np.asarray(patch_list)
    if patch_list.ndim == 3:
        patch_list = patch_list[:, :, :, np.newaxis]
    assert patch_list.ndim == 4 and patch_list.shape[3] in [1, 3], patch_list.shape
    if shuffle:
        np.random.shuffle(patch_list)
    ph, pw = patch_list.shape[1:3]
    # if nr_row/nr_col are not given, fit as many patches as the size limits allow
    nr_row = minnone(nr_row, max_height // (ph + border))
    nr_col = minnone(nr_col, max_width // (pw + border))

    canvas = np.zeros((nr_row * (ph + border) - border,
                       nr_col * (pw + border) - border,
                       patch_list.shape[3]), dtype='uint8')

    def draw_patch(plist):
        cur_row, cur_col = 0, 0
        canvas.fill(bgcolor)
        for patch in plist:
            r0 = cur_row * (ph + border)
            c0 = cur_col * (pw + border)
            canvas[r0:r0 + ph, c0:c0 + pw] = patch
            cur_col += 1
            if cur_col == nr_col:
                cur_col = 0
                cur_row += 1

    # yield one full canvas at a time until all patches are consumed
    nr_patch = nr_row * nr_col
    start = 0
    while True:
        end = start + nr_patch
        cur_list = patch_list[start:end]
        if not len(cur_list):
            return
        draw_patch(cur_list)
        yield canvas
        start = end


if __name__ == '__main__':
    import cv2
    # expects 000.png .. 099.png, as written by view_warp in mnist-addition.py
    imglist = []
    for i in range(100):
        fname = "{:03d}.png".format(i)
        imglist.append(cv2.imread(fname))
    for idx, patch in enumerate(build_patch_list(
            imglist, max_width=500, max_height=200)):
        of = "patch{:02d}.png".format(idx)
        cv2.imwrite(of, patch)
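
The `__main__` block stitches the per-sample images written by `view_warp` into grid canvases of bounded size; the demo figure in the README was presumably assembled this way.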