Commit e44e9c04 authored by Yuxin Wu

migrate all examples to use tf.layers naming convention (#627)

parent fe33c833
@@ -13,7 +13,7 @@ It's Yet Another TF wrapper, but different in:
 1. Focus on __training speed__.
   + Speed comes for free with tensorpack -- it uses TensorFlow in the __efficient way__ with no extra overhead.
-    On different CNNs, it runs [1.1~3.5x faster](https://github.com/tensorpack/benchmarks/tree/master/other-wrappers) than the equivalent Keras code.
+    On different CNNs, it runs [1.2~4x faster](https://github.com/tensorpack/benchmarks/tree/master/other-wrappers) than the equivalent Keras code.
   + Data-parallel multi-GPU training is off-the-shelf to use. It scales as well as Google's [official benchmark](https://www.tensorflow.org/performance/benchmarks).
...
@@ -24,9 +24,9 @@ def maybe_freeze_affine(getter, *args, **kwargs):
 @contextmanager
 def resnet_argscope():
-    with argscope([Conv2D, MaxPooling, BatchNorm], data_format='NCHW'), \
+    with argscope([Conv2D, MaxPooling, BatchNorm], data_format='channels_first'), \
             argscope(Conv2D, use_bias=False), \
-            argscope(BatchNorm, use_local_stat=False), \
+            argscope(BatchNorm, training=False), \
             custom_getter_scope(maybe_freeze_affine):
         yield
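For reference, the renames this commit applies everywhere follow the tf.layers convention. A minimal sketch of a new-style call; the tensor `l` and the concrete numbers are illustrative, not taken from the diff:

```python
# tensorpack argument renames in this commit (old -> tf.layers style):
#   kernel_shape -> kernel_size        stride   -> strides
#   nl           -> activation         W_init   -> kernel_initializer
#   b_init       -> bias_initializer   Deconv2D -> Conv2DTranspose
#   BatchNorm: use_local_stat -> training
#   data_format: 'NCHW'/'NHWC' -> 'channels_first'/'channels_last'
l = Conv2D('conv1', l, 64, kernel_size=3, strides=1,
           activation=tf.nn.relu,
           kernel_initializer=tf.variance_scaling_initializer(scale=2.0))
```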
@@ -50,36 +50,36 @@ def image_preprocess(image, bgr=True):
 def get_bn(zero_init=False):
     if zero_init:
-        return lambda x, name: BatchNorm('bn', x, gamma_init=tf.zeros_initializer())
+        return lambda x, name=None: BatchNorm('bn', x, gamma_init=tf.zeros_initializer())
     else:
-        return lambda x, name: BatchNorm('bn', x)
+        return lambda x, name=None: BatchNorm('bn', x)

-def resnet_shortcut(l, n_out, stride, nl=tf.identity):
+def resnet_shortcut(l, n_out, stride, activation=tf.identity):
     data_format = get_arg_scope()['Conv2D']['data_format']
-    n_in = l.get_shape().as_list()[1 if data_format == 'NCHW' else 3]
+    n_in = l.get_shape().as_list()[1 if data_format in ['NCHW', 'channels_first'] else 3]
     if n_in != n_out:   # change dimension when channel is not the same
         if stride == 2:
             l = l[:, :, :-1, :-1]
             return Conv2D('convshortcut', l, n_out, 1,
-                          stride=stride, padding='VALID', nl=nl)
+                          strides=stride, padding='VALID', activation=activation)
         else:
             return Conv2D('convshortcut', l, n_out, 1,
-                          stride=stride, nl=nl)
+                          strides=stride, activation=activation)
     else:
         return l

 def resnet_bottleneck(l, ch_out, stride):
     l, shortcut = l, l
-    l = Conv2D('conv1', l, ch_out, 1, nl=BNReLU)
+    l = Conv2D('conv1', l, ch_out, 1, activation=BNReLU)
     if stride == 2:
         l = tf.pad(l, [[0, 0], [0, 0], [0, 1], [0, 1]])
-        l = Conv2D('conv2', l, ch_out, 3, stride=2, nl=BNReLU, padding='VALID')
+        l = Conv2D('conv2', l, ch_out, 3, strides=2, activation=BNReLU, padding='VALID')
     else:
-        l = Conv2D('conv2', l, ch_out, 3, stride=stride, nl=BNReLU)
+        l = Conv2D('conv2', l, ch_out, 3, strides=stride, activation=BNReLU)
-    l = Conv2D('conv3', l, ch_out * 4, 1, nl=get_bn(zero_init=True))
+    l = Conv2D('conv3', l, ch_out * 4, 1, activation=get_bn(zero_init=True))
-    return l + resnet_shortcut(shortcut, ch_out * 4, stride, nl=get_bn(zero_init=False))
+    return l + resnet_shortcut(shortcut, ch_out * 4, stride, activation=get_bn(zero_init=False))

 def resnet_group(l, name, block_func, features, count, stride):
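The pad-then-VALID pattern in `resnet_bottleneck` (and in `pretrained_resnet_conv4` below) reproduces Caffe2's alignment for stride-2 convolutions instead of TensorFlow's SAME padding. A quick arithmetic check, not part of the commit (sizes illustrative):

```python
# VALID conv output size is (n - k) // s + 1.  Pad one pixel on the
# right/bottom, then take a 3x3, stride-2 VALID conv of a 56-pixel axis:
n, k, s = 56, 3, 2
out = (n + 1 - k) // s + 1      # -> 28, same size as SAME padding,
assert out == n // 2            # but sampled on Caffe2's grid
```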
@@ -97,9 +97,9 @@ def pretrained_resnet_conv4(image, num_blocks, freeze_c2=True):
     assert len(num_blocks) == 3
     with resnet_argscope():
         l = tf.pad(image, [[0, 0], [0, 0], [2, 3], [2, 3]])
-        l = Conv2D('conv0', l, 64, 7, stride=2, nl=BNReLU, padding='VALID')
+        l = Conv2D('conv0', l, 64, 7, strides=2, activation=BNReLU, padding='VALID')
         l = tf.pad(l, [[0, 0], [0, 0], [0, 1], [0, 1]])
-        l = MaxPooling('pool0', l, shape=3, stride=2, padding='VALID')
+        l = MaxPooling('pool0', l, 3, strides=2, padding='VALID')
         c2 = resnet_group(l, 'group0', resnet_bottleneck, 64, num_blocks[0], 1)
         # TODO replace var by const to enable optimization
         if freeze_c2:
...
@@ -8,7 +8,7 @@ from tensorpack.tfutils.summary import add_moving_summary
 from tensorpack.tfutils.argscope import argscope
 from tensorpack.tfutils.scope_utils import under_name_scope
 from tensorpack.models import (
-    Conv2D, FullyConnected, GlobalAvgPooling, layer_register, Deconv2D)
+    Conv2D, FullyConnected, GlobalAvgPooling, layer_register, Conv2DTranspose)
 from utils.box_ops import pairwise_iou
 import config
@@ -34,9 +34,9 @@ def rpn_head(featuremap, channel, num_anchors):
         label_logits: fHxfWxNA
         box_logits: fHxfWxNAx4
     """
-    with argscope(Conv2D, data_format='NCHW',
-                  W_init=tf.random_normal_initializer(stddev=0.01)):
+    with argscope(Conv2D, data_format='channels_first',
+                  kernel_initializer=tf.random_normal_initializer(stddev=0.01)):
-        hidden = Conv2D('conv0', featuremap, channel, 3, nl=tf.nn.relu)
+        hidden = Conv2D('conv0', featuremap, channel, 3, activation=tf.nn.relu)
         label_logits = Conv2D('class', hidden, num_anchors, 1)
         box_logits = Conv2D('box', hidden, 4 * num_anchors, 1)
@@ -384,13 +384,13 @@ def fastrcnn_head(feature, num_classes):
     Returns:
         cls_logits (Nxnum_class), reg_logits (Nx num_class-1 x 4)
     """
-    feature = GlobalAvgPooling('gap', feature, data_format='NCHW')
+    feature = GlobalAvgPooling('gap', feature, data_format='channels_first')
     classification = FullyConnected(
         'class', feature, num_classes,
-        W_init=tf.random_normal_initializer(stddev=0.01))
+        kernel_initializer=tf.random_normal_initializer(stddev=0.01))
     box_regression = FullyConnected(
         'box', feature, (num_classes - 1) * 4,
-        W_init=tf.random_normal_initializer(stddev=0.001))
+        kernel_initializer=tf.random_normal_initializer(stddev=0.001))
     box_regression = tf.reshape(box_regression, (-1, num_classes - 1, 4))
     return classification, box_regression
@@ -501,11 +501,11 @@ def maskrcnn_head(feature, num_class):
     Returns:
         mask_logits (N x num_category x 14 x 14):
     """
-    with argscope([Conv2D, Deconv2D], data_format='NCHW',
-                  W_init=tf.variance_scaling_initializer(
+    with argscope([Conv2D, Conv2DTranspose], data_format='channels_first',
+                  kernel_initializer=tf.variance_scaling_initializer(
                       scale=2.0, mode='fan_out', distribution='normal')):
         # c2's MSRAFill is fan_out
-        l = Deconv2D('deconv', feature, 256, 2, stride=2, nl=tf.nn.relu)
+        l = Conv2DTranspose('deconv', feature, 256, 2, strides=2, activation=tf.nn.relu)
         l = Conv2D('conv', l, num_class - 1, 1)
         return l
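The stride-2 `Conv2DTranspose` doubles the mask resolution. A shape check under the docstring's 14x14 output, which implies a 7x7 input feature; these sizes are inferred, not stated in the diff:

```python
in_size, k, s = 7, 2, 2
out_size = (in_size - 1) * s + k     # transposed-conv (VALID) arithmetic
assert out_size == 14                # 2x2 kernel with strides=2: 7x7 -> 14x14
```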
...
@@ -10,12 +10,9 @@ import cv2
 import tensorflow as tf

-from tensorpack import logger, QueueInput, InputDesc, PlaceholderInput, TowerContext
-from tensorpack.models import *
-from tensorpack.callbacks import *
-from tensorpack.train import *
+from tensorpack import *
 from tensorpack.dataflow import imgaug
-from tensorpack.tfutils import argscope, get_model_loader
+from tensorpack.tfutils import argscope, get_model_loader, model_utils
 from tensorpack.tfutils.scope_utils import under_name_scope
 from tensorpack.utils.gpu import get_nr_gpu
@@ -55,7 +52,7 @@ def channel_shuffle(l, group):
     return l

-def BN(x, name):
+def BN(x, name=None):
     return BatchNorm('bn', x)
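The `name=None` default matters because tf.layers-style layers invoke their `activation` with a single tensor argument, so the old two-argument `def BN(x, name)` form would raise a TypeError there; this reading of the change is ours, not stated in the diff. With the default in place, `BN` can be passed directly (a hypothetical call, for illustration):

```python
# BN now works wherever an activation callable is expected:
l = Conv2D('conv_dw', l, 64, 3, activation=BN)   # invoked internally as BN(output)
```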
@@ -206,6 +203,7 @@ if __name__ == '__main__':
     input.setup(input_desc)
     with TowerContext('', is_training=True):
         model.build_graph(*input.get_input_tensors())
+        model_utils.describe_trainable_vars()
     tf.profiler.profile(
         tf.get_default_graph(),
...
@@ -74,7 +74,7 @@ class Model(ModelDesc):
         embeddingW = tf.get_variable('embedding', [VOCAB_SIZE, HIDDEN_SIZE], initializer=initializer)
         input_feature = tf.nn.embedding_lookup(embeddingW, input)  # B x seqlen x hiddensize
-        input_feature = Dropout(input_feature, DROPOUT)
+        input_feature = Dropout(input_feature, rate=DROPOUT)
         with tf.variable_scope('LSTM', initializer=initializer):
             input_list = tf.unstack(input_feature, num=SEQ_LEN, axis=1)  # seqlen x (Bxhidden)
@@ -89,7 +89,9 @@ class Model(ModelDesc):
         # seqlen x (Bxrnnsize)
         output = tf.reshape(tf.concat(outputs, 1), [-1, HIDDEN_SIZE])  # (Bxseqlen) x hidden
-        logits = FullyConnected('fc', output, VOCAB_SIZE, nl=tf.identity, W_init=initializer, b_init=initializer)
+        logits = FullyConnected('fc', output, VOCAB_SIZE,
+                                activation=tf.identity, kernel_initializer=initializer,
+                                bias_initializer=initializer)
         xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
             logits=logits, labels=tf.reshape(nextinput, [-1]))
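The explicit `rate=` keyword is worth calling out: tf.layers dropout takes the probability of *dropping* a unit, while the old positional tensorpack argument was a keep probability, so naming the argument avoids silently inverted semantics (this reading of the rename is ours, not stated in the diff). A minimal TF1-style illustration:

```python
import tensorflow as tf

x = tf.ones([8, 16])
# rate=0.4 drops ~40% of units; a keep_prob of 0.4 would have kept 40%.
y = tf.layers.dropout(x, rate=0.4, training=True)
```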
...
@@ -27,14 +27,13 @@ def preactivation_block(input, num_filters, stride=1):
     # residual
     net = BNReLU(input)
-    residual = Conv2D('conv1', net, num_filters, kernel_shape=3, stride=stride, use_bias=False, nl=BNReLU)
+    residual = Conv2D('conv1', net, num_filters, kernel_size=3, strides=stride, use_bias=False, activation=BNReLU)
-    residual = Conv2D('conv2', residual, num_filters, kernel_shape=3, stride=1, use_bias=False, nl=tf.identity)
+    residual = Conv2D('conv2', residual, num_filters, kernel_size=3, strides=1, use_bias=False)
     # identity
     shortcut = input
     if stride != 1 or num_filters_in != num_filters:
-        shortcut = Conv2D('shortcut', net, num_filters, kernel_shape=1, stride=stride, use_bias=False,
-                          nl=tf.identity)
+        shortcut = Conv2D('shortcut', net, num_filters, kernel_size=1, strides=stride, use_bias=False)
     return shortcut + residual
@@ -54,17 +53,17 @@ class ResNet_Cifar(ModelDesc):
         image = tf.transpose(image, [0, 3, 1, 2])

         pytorch_default_init = tf.variance_scaling_initializer(scale=1.0 / 3, mode='fan_in', distribution='uniform')
-        with argscope([Conv2D, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
-                argscope(Conv2D, W_init=pytorch_default_init):
+        with argscope([Conv2D, BatchNorm, GlobalAvgPooling], data_format='channels_first'), \
+                argscope(Conv2D, kernel_initializer=pytorch_default_init):
-            net = Conv2D('conv0', image, 64, kernel_shape=3, stride=1, use_bias=False)
+            net = Conv2D('conv0', image, 64, kernel_size=3, strides=1, use_bias=False)
             for i, blocks_in_module in enumerate(MODULE_SIZES):
                 for j in range(blocks_in_module):
                     stride = 2 if j == 0 and i > 0 else 1
                     with tf.variable_scope("res%d.%d" % (i, j)):
                         net = preactivation_block(net, FILTER_SIZES[i], stride)
             net = GlobalAvgPooling('gap', net)
-            logits = FullyConnected('linear', net, out_dim=CLASS_NUM,
-                                    nl=tf.identity, W_init=tf.random_normal_initializer(stddev=1e-3))
+            logits = FullyConnected('linear', net, CLASS_NUM,
+                                    kernel_initializer=tf.random_normal_initializer(stddev=1e-3))
         ce_cost = tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=logits)
         ce_cost = tf.reduce_mean(ce_cost, name='cross_entropy_loss')
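`pytorch_default_init` reproduces PyTorch's default uniform init, U(-1/sqrt(fan_in), 1/sqrt(fan_in)): a uniform variance-scaling distribution uses the bound sqrt(3 * scale / fan_in), so scale = 1/3 lands exactly on PyTorch's bound. A quick check with an illustrative fan-in:

```python
import math

scale, fan_in = 1.0 / 3, 3 * 3 * 64        # fan_in of a 3x3, 64-channel conv (illustrative)
bound = math.sqrt(3.0 * scale / fan_in)    # uniform limit used by variance_scaling
assert math.isclose(bound, 1.0 / math.sqrt(fan_in))
```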
...
@@ -63,7 +63,7 @@ class Model(ModelDesc):
             with tf.variable_scope(name):
                 b1 = l if first else BNReLU(l)
-                c1 = Conv2D('conv1', b1, out_channel, stride=stride1, nl=BNReLU)
+                c1 = Conv2D('conv1', b1, out_channel, strides=stride1, activation=BNReLU)
                 c2 = Conv2D('conv2', c1, out_channel)
                 if increase_dim:
                     l = AvgPooling('pool', l, 2)
@@ -72,10 +72,10 @@ class Model(ModelDesc):
                 l = c2 + l
                 return l

-        with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
-                argscope(Conv2D, nl=tf.identity, use_bias=False, kernel_shape=3,
-                         W_init=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')):
+        with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='channels_first'), \
+                argscope(Conv2D, use_bias=False, kernel_size=3,
+                         kernel_initializer=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')):
-            l = Conv2D('conv0', image, 16, nl=BNReLU)
+            l = Conv2D('conv0', image, 16, activation=BNReLU)
             l = residual('res1.0', l, first=True)
             for k in range(1, self.n):
                 l = residual('res1.{}'.format(k), l)
@@ -93,7 +93,7 @@ class Model(ModelDesc):
             # 8,c=64
             l = GlobalAvgPooling('gap', l)
-            logits = FullyConnected('linear', l, out_dim=10, nl=tf.identity)
+            logits = FullyConnected('linear', l, 10)
             tf.nn.softmax(logits, name='output')
         cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
...
@@ -43,17 +43,17 @@ class Model(ModelDesc):
         image = tf.pad(image, [[0, 0], [3, 3], [3, 3], [0, 0]])
         image = tf.transpose(image, [0, 3, 1, 2])
         with argscope([Conv2D, MaxPooling, GlobalAvgPooling, BatchNorm],
-                      data_format='NCHW'), \
-                argscope(Conv2D, nl=tf.identity, use_bias=False):
+                      data_format='channels_first'), \
+                argscope(Conv2D, use_bias=False):
             logits = (LinearWrap(image)
-                      .Conv2D('conv0', 64, 7, stride=2, nl=BNReLU, padding='VALID')
-                      .MaxPooling('pool0', shape=3, stride=2, padding='SAME')
+                      .Conv2D('conv0', 64, 7, strides=2, activation=BNReLU, padding='VALID')
+                      .MaxPooling('pool0', 3, strides=2, padding='SAME')
                       .apply(resnet_group, 'group0', bottleneck, 64, blocks[0], 1)
                       .apply(resnet_group, 'group1', bottleneck, 128, blocks[1], 2)
                       .apply(resnet_group, 'group2', bottleneck, 256, blocks[2], 2)
                       .apply(resnet_group, 'group3', bottleneck, 512, blocks[3], 2)
                       .GlobalAvgPooling('gap')
-                      .FullyConnected('linear', 1000, nl=tf.identity)())
+                      .FullyConnected('linear', 1000)())
         tf.nn.softmax(logits, name='prob')
         ImageNetModel.compute_loss_and_error(logits, label)
...
@@ -11,11 +11,11 @@ from tensorpack.models import (
     LinearWrap)

-def resnet_shortcut(l, n_out, stride, nl=tf.identity):
+def resnet_shortcut(l, n_out, stride, activation=tf.identity):
     data_format = get_arg_scope()['Conv2D']['data_format']
-    n_in = l.get_shape().as_list()[1 if data_format == 'NCHW' else 3]
+    n_in = l.get_shape().as_list()[1 if data_format in ['NCHW', 'channels_first'] else 3]
     if n_in != n_out:   # change dimension when channel is not the same
-        return Conv2D('convshortcut', l, n_out, 1, stride=stride, nl=nl)
+        return Conv2D('convshortcut', l, n_out, 1, strides=stride, activation=activation)
     else:
         return l
@@ -34,14 +34,14 @@ def get_bn(zero_init=False):
     Zero init gamma is good for resnet. See https://arxiv.org/abs/1706.02677.
     """
     if zero_init:
-        return lambda x, name: BatchNorm('bn', x, gamma_init=tf.zeros_initializer())
+        return lambda x, name=None: BatchNorm('bn', x, gamma_initializer=tf.zeros_initializer())
     else:
-        return lambda x, name: BatchNorm('bn', x)
+        return lambda x, name=None: BatchNorm('bn', x)

 def preresnet_basicblock(l, ch_out, stride, preact):
     l, shortcut = apply_preactivation(l, preact)
-    l = Conv2D('conv1', l, ch_out, 3, stride=stride, nl=BNReLU)
+    l = Conv2D('conv1', l, ch_out, 3, strides=stride, activation=BNReLU)
     l = Conv2D('conv2', l, ch_out, 3)
     return l + resnet_shortcut(shortcut, ch_out, stride)
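As the docstring notes, zero-initializing the last BN's gamma makes each residual block an identity function at the start of training: the normalized branch is multiplied by zero, so only the shortcut passes through. A numpy sketch of the effect (shapes illustrative):

```python
import numpy as np

branch = np.random.randn(4, 64)        # residual-branch output entering the BN
gamma, beta = 0.0, 0.0                 # zero-init gamma, default zero beta
bn = gamma * (branch - branch.mean(0)) / branch.std(0) + beta
assert np.allclose(bn, 0)              # block output == shortcut at init
```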
@@ -49,8 +49,8 @@ def preresnet_basicblock(l, ch_out, stride, preact):
 def preresnet_bottleneck(l, ch_out, stride, preact):
     # stride is applied on the second conv, following fb.resnet.torch
     l, shortcut = apply_preactivation(l, preact)
-    l = Conv2D('conv1', l, ch_out, 1, nl=BNReLU)
-    l = Conv2D('conv2', l, ch_out, 3, stride=stride, nl=BNReLU)
+    l = Conv2D('conv1', l, ch_out, 1, activation=BNReLU)
+    l = Conv2D('conv2', l, ch_out, 3, strides=stride, activation=BNReLU)
     l = Conv2D('conv3', l, ch_out * 4, 1)
     return l + resnet_shortcut(shortcut, ch_out * 4, stride)
@@ -70,9 +70,9 @@ def preresnet_group(l, name, block_func, features, count, stride):
 def resnet_basicblock(l, ch_out, stride):
     shortcut = l
-    l = Conv2D('conv1', l, ch_out, 3, stride=stride, nl=BNReLU)
-    l = Conv2D('conv2', l, ch_out, 3, nl=get_bn(zero_init=True))
-    return l + resnet_shortcut(shortcut, ch_out, stride, nl=get_bn(zero_init=False))
+    l = Conv2D('conv1', l, ch_out, 3, strides=stride, activation=BNReLU)
+    l = Conv2D('conv2', l, ch_out, 3, activation=get_bn(zero_init=True))
+    return l + resnet_shortcut(shortcut, ch_out, stride, activation=get_bn(zero_init=False))

 def resnet_bottleneck(l, ch_out, stride, stride_first=False):
@@ -80,27 +80,27 @@ def resnet_bottleneck(l, ch_out, stride, stride_first=False):
         stride_first: original resnet put stride on first conv. fb.resnet.torch put stride on second conv.
     """
     shortcut = l
-    l = Conv2D('conv1', l, ch_out, 1, stride=stride if stride_first else 1, nl=BNReLU)
-    l = Conv2D('conv2', l, ch_out, 3, stride=1 if stride_first else stride, nl=BNReLU)
-    l = Conv2D('conv3', l, ch_out * 4, 1, nl=get_bn(zero_init=True))
-    return l + resnet_shortcut(shortcut, ch_out * 4, stride, nl=get_bn(zero_init=False))
+    l = Conv2D('conv1', l, ch_out, 1, strides=stride if stride_first else 1, activation=BNReLU)
+    l = Conv2D('conv2', l, ch_out, 3, strides=1 if stride_first else stride, activation=BNReLU)
+    l = Conv2D('conv3', l, ch_out * 4, 1, activation=get_bn(zero_init=True))
+    return l + resnet_shortcut(shortcut, ch_out * 4, stride, activation=get_bn(zero_init=False))

 def se_resnet_bottleneck(l, ch_out, stride):
     shortcut = l
-    l = Conv2D('conv1', l, ch_out, 1, nl=BNReLU)
-    l = Conv2D('conv2', l, ch_out, 3, stride=stride, nl=BNReLU)
-    l = Conv2D('conv3', l, ch_out * 4, 1, nl=get_bn(zero_init=True))
+    l = Conv2D('conv1', l, ch_out, 1, activation=BNReLU)
+    l = Conv2D('conv2', l, ch_out, 3, strides=stride, activation=BNReLU)
+    l = Conv2D('conv3', l, ch_out * 4, 1, activation=get_bn(zero_init=True))
     squeeze = GlobalAvgPooling('gap', l)
-    squeeze = FullyConnected('fc1', squeeze, ch_out // 4, nl=tf.nn.relu)
-    squeeze = FullyConnected('fc2', squeeze, ch_out * 4, nl=tf.nn.sigmoid)
+    squeeze = FullyConnected('fc1', squeeze, ch_out // 4, activation=tf.nn.relu)
+    squeeze = FullyConnected('fc2', squeeze, ch_out * 4, activation=tf.nn.sigmoid)
     data_format = get_arg_scope()['Conv2D']['data_format']
-    ch_ax = 1 if data_format == 'NCHW' else 3
+    ch_ax = 1 if data_format in ['NCHW', 'channels_first'] else 3
     shape = [-1, 1, 1, 1]
     shape[ch_ax] = ch_out * 4
     l = l * tf.reshape(squeeze, shape)
-    return l + resnet_shortcut(shortcut, ch_out * 4, stride, nl=get_bn(zero_init=False))
+    return l + resnet_shortcut(shortcut, ch_out * 4, stride, activation=get_bn(zero_init=False))

 def resnet_group(l, name, block_func, features, count, stride):
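In `se_resnet_bottleneck`, the squeeze vector has shape N x (4*ch_out) and must gate an N x C x H x W feature map, hence the reshape to put channels on the channel axis before the multiply. A numpy sketch of that broadcast (sizes illustrative):

```python
import numpy as np

n, c, h, w = 2, 8, 4, 4
feat = np.random.randn(n, c, h, w)        # channels_first feature map
gate = np.random.rand(n, c)               # sigmoid output of 'fc2'
scaled = feat * gate.reshape(n, c, 1, 1)  # per-channel scaling via broadcasting
assert scaled.shape == feat.shape
```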
@@ -114,15 +114,15 @@ def resnet_group(l, name, block_func, features, count, stride):
 def resnet_backbone(image, num_blocks, group_func, block_func):
-    with argscope(Conv2D, nl=tf.identity, use_bias=False,
-                  W_init=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')):
+    with argscope(Conv2D, use_bias=False,
+                  kernel_initializer=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')):
         logits = (LinearWrap(image)
-                  .Conv2D('conv0', 64, 7, stride=2, nl=BNReLU)
+                  .Conv2D('conv0', 64, 7, strides=2, activation=BNReLU)
                   .MaxPooling('pool0', shape=3, stride=2, padding='SAME')
                   .apply(group_func, 'group0', block_func, 64, num_blocks[0], 1)
                   .apply(group_func, 'group1', block_func, 128, num_blocks[1], 2)
                   .apply(group_func, 'group2', block_func, 256, num_blocks[2], 2)
                   .apply(group_func, 'group3', block_func, 512, num_blocks[3], 2)
                   .GlobalAvgPooling('gap')
-                  .FullyConnected('linear', 1000, nl=tf.identity)())
+                  .FullyConnected('linear', 1000)())
     return logits
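`LinearWrap` exposes every registered tensorpack layer as a chainable method, `.apply` splices in a plain function, and the trailing `()` returns the underlying tensor. The chain above is roughly equivalent to the following nested calls, written here with the new argument names (a sketch, not part of the diff):

```python
l = Conv2D('conv0', image, 64, 7, strides=2, activation=BNReLU)
l = MaxPooling('pool0', l, 3, strides=2, padding='SAME')
l = group_func(l, 'group0', block_func, 64, num_blocks[0], 1)
l = group_func(l, 'group1', block_func, 128, num_blocks[1], 2)
l = group_func(l, 'group2', block_func, 256, num_blocks[2], 2)
l = group_func(l, 'group3', block_func, 512, num_blocks[3], 2)
l = GlobalAvgPooling('gap', l)
logits = FullyConnected('linear', l, 1000)
```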
@@ -46,12 +46,12 @@ class Model(ModelDesc):
         }
         defs, block_func = cfg[DEPTH]
-        with argscope(Conv2D, nl=tf.identity, use_bias=False,
-                      W_init=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')), \
-                argscope([Conv2D, MaxPooling, GlobalAvgPooling, BatchNorm], data_format='NCHW'):
+        with argscope(Conv2D, use_bias=False,
+                      kernel_initializer=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')), \
+                argscope([Conv2D, MaxPooling, GlobalAvgPooling, BatchNorm], data_format='channels_first'):
             convmaps = (LinearWrap(image)
-                        .Conv2D('conv0', 64, 7, stride=2, nl=BNReLU)
-                        .MaxPooling('pool0', shape=3, stride=2, padding='SAME')
+                        .Conv2D('conv0', 64, 7, strides=2, activation=BNReLU)
+                        .MaxPooling('pool0', 3, strides=2, padding='SAME')
                         .apply(preresnet_group, 'group0', block_func, 64, defs[0], 1)
                         .apply(preresnet_group, 'group1', block_func, 128, defs[1], 2)
                         .apply(preresnet_group, 'group2', block_func, 256, defs[2], 2)
@@ -59,7 +59,7 @@ class Model(ModelDesc):
         print(convmaps)
         logits = (LinearWrap(convmaps)
                   .GlobalAvgPooling('gap')
-                  .FullyConnected('linearnew', 1000, nl=tf.identity)())
+                  .FullyConnected('linearnew', 1000)())
         loss = compute_loss_and_error(logits, label)
         wd_cost = regularize_cost('.*/W', l2_regularizer(1e-4), name='l2_regularize_loss')
...
-../ResNet/imagenet_utils.py
+../ImageNetModels/imagenet_utils.py
\ No newline at end of file
@@ -39,10 +39,10 @@ class Model(ModelDesc):
                    .Conv2D('conv0', 20, 5, padding='VALID')
                    .MaxPooling('pool0', 2)
                    .Conv2D('conv1', 20, 5, padding='VALID')
-                   .FullyConnected('fc1', out_dim=32)
-                   .FullyConnected('fct', out_dim=6, nl=tf.identity,
-                                   W_init=tf.constant_initializer(),
-                                   b_init=tf.constant_initializer([1, 0, HALF_DIFF, 0, 1, HALF_DIFF]))())
+                   .FullyConnected('fc1', 32)
+                   .FullyConnected('fct', 6, activation=tf.identity,
+                                   kernel_initializer=tf.constant_initializer(),
+                                   bias_initializer=tf.constant_initializer([1, 0, HALF_DIFF, 0, 1, HALF_DIFF]))())
         # output 6 parameters for affine transformation
         stn = tf.reshape(stn, [-1, 2, 3], name='affine')  # bx2x3
         stn = tf.reshape(tf.transpose(stn, [2, 0, 1]), [3, -1])  # 3 x (bx2)
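With zero weights and that bias, 'fct' initially emits the same six numbers for every input; reshaped to 2x3 they form an affine matrix that only translates by HALF_DIFF, so the spatial transformer starts from a well-defined crop rather than a random warp. A small check (the HALF_DIFF value is illustrative):

```python
import numpy as np

HALF_DIFF = 10.0                                    # illustrative value
theta = np.array([1, 0, HALF_DIFF,
                  0, 1, HALF_DIFF]).reshape(2, 3)   # the bias, as a 2x3 affine map
pt = np.array([5.0, 7.0, 1.0])                      # homogeneous pixel coordinate
print(theta @ pt)                                   # [15. 17.]: pure translation
```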
@@ -52,7 +52,7 @@ class Model(ModelDesc):
             sampled = ImageSample('warp', [image, coor], borderMode='constant')
             return sampled

-        with argscope([Conv2D, FullyConnected], nl=tf.nn.relu):
+        with argscope([Conv2D, FullyConnected], activation=tf.nn.relu):
             with tf.variable_scope('STN1'):
                 sampled1 = get_stn(image)
             with tf.variable_scope('STN2'):
@@ -71,9 +71,9 @@ class Model(ModelDesc):
         sampled = tf.concat([sampled1, sampled2], 3, 'sampled_concat')

         logits = (LinearWrap(sampled)
-                  .FullyConnected('fc1', out_dim=256, nl=tf.nn.relu)
-                  .FullyConnected('fc2', out_dim=128, nl=tf.nn.relu)
-                  .FullyConnected('fct', out_dim=19, nl=tf.identity)())
+                  .FullyConnected('fc1', 256, activation=tf.nn.relu)
+                  .FullyConnected('fc2', 128, activation=tf.nn.relu)
+                  .FullyConnected('fct', 19, activation=tf.identity)())
         tf.nn.softmax(logits, name='prob')
         cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
...
@@ -63,8 +63,8 @@ class Model(GANModelDesc):
         def resnet_block(x, name):
             with tf.variable_scope(name):
-                y = Conv2D('conv0', x, NF, nl=tf.nn.relu)
-                y = Conv2D('conv1', y, NF, nl=tf.identity)
+                y = Conv2D('conv0', x, NF, activation=tf.nn.relu)
+                y = Conv2D('conv1', y, NF, activation=tf.identity)
                 return x + y

         def upsample(x, factor=2):
@@ -74,7 +74,7 @@ class Model(GANModelDesc):
         def generator(x, Ibicubic):
             x = x - VGG_MEAN_TENSOR / 255.0
-            with argscope(Conv2D, kernel_shape=3, stride=1, nl=tf.nn.relu):
+            with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
                 x = Conv2D('conv1', x, NF)
                 for i in range(10):
                     x = resnet_block(x, 'block_%i' % i)
@@ -83,27 +83,27 @@ class Model(GANModelDesc):
                 x = upsample(x)
                 x = Conv2D('conv_post_2', x, NF)
                 x = Conv2D('conv_post_3', x, NF)
-                Ires = Conv2D('conv_post_4', x, 3, nl=tf.identity)
+                Ires = Conv2D('conv_post_4', x, 3, activation=tf.identity)
                 Iest = tf.add(Ibicubic, Ires, name='Iest')
                 return Iest  # [0,1]

         @auto_reuse_variable_scope
         def discriminator(x):
             x = x - VGG_MEAN_TENSOR / 255.0
-            with argscope(Conv2D, kernel_shape=3, stride=1, nl=tf.nn.leaky_relu):
+            with argscope(Conv2D, kernel_size=3, activation=tf.nn.leaky_relu):
                 x = Conv2D('conv0', x, 32)
-                x = Conv2D('conv0b', x, 32, stride=2)
+                x = Conv2D('conv0b', x, 32, strides=2)
                 x = Conv2D('conv1', x, 64)
-                x = Conv2D('conv1b', x, 64, stride=2)
+                x = Conv2D('conv1b', x, 64, strides=2)
                 x = Conv2D('conv2', x, 128)
-                x = Conv2D('conv2b', x, 128, stride=2)
+                x = Conv2D('conv2b', x, 128, strides=2)
                 x = Conv2D('conv3', x, 256)
-                x = Conv2D('conv3b', x, 256, stride=2)
+                x = Conv2D('conv3b', x, 256, strides=2)
                 x = Conv2D('conv4', x, 512)
-                x = Conv2D('conv4b', x, 512, stride=2)
-                x = FullyConnected('fc0', x, 1024, nl=tf.nn.leaky_relu)
-                x = FullyConnected('fc1', x, 1, nl=tf.identity)
+                x = Conv2D('conv4b', x, 512, strides=2)
+                x = FullyConnected('fc0', x, 1024, activation=tf.nn.leaky_relu)
+                x = FullyConnected('fc1', x, 1, activation=tf.identity)
             return x

         def additional_losses(a, b):
@@ -113,7 +113,7 @@ class Model(GANModelDesc):
             x = x - VGG_MEAN_TENSOR
             # VGG 19
             with varreplace.freeze_variables():
-                with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
+                with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
                     conv1_1 = Conv2D('conv1_1', x, 64)
                     conv1_2 = Conv2D('conv1_2', conv1_1, 64)
                     pool1 = MaxPooling('pool1', conv1_2, 2)  # 64
...
@@ -98,18 +98,21 @@ class MinSaver(Callback):
             reverse (bool): if True, will save the maximum.
             filename (str): the name for the saved model.
                 Defaults to ``min-{monitor_stat}.tfmodel``.

         Example:
             Save the model with minimum validation error to
             "min-val-error.tfmodel":

             .. code-block:: python

                 MinSaver('val-error')

-        Note:
+        Notes:
             It assumes that :class:`ModelSaver` is used with
             the same ``checkpoint_dir``. And it will save
             the model to that directory as well.

             The default for both :class:`ModelSaver` and :class:`MinSaver`
             is ``checkpoint_dir=logger.get_logger_dir()``
         """
         self.monitor_stat = monitor_stat
         self.reverse = reverse
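In practice the two callbacks are registered together so that MinSaver can find the checkpoints ModelSaver writes; a hedged usage sketch (the surrounding trainer config is elided):

```python
callbacks = [
    ModelSaver(),           # checkpoints into logger.get_logger_dir()
    MinSaver('val-error'),  # re-saves the best one as min-val-error.tfmodel
]
```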
...
@@ -517,8 +517,7 @@ class StagingInput(FeedfreeInput):
         Args:
             input (FeedfreeInput):
             nr_stage: number of elements to prefetch into each StagingArea, at the beginning.
-                Since enqueue and dequeue are synchronized, prefetching 1
-                element should be sufficient.
+                Since enqueue and dequeue are synchronized, prefetching 1 element should be sufficient.
             towers: deprecated
             device (str or None): if not None, place the StagingArea on a specific device. e.g., '/cpu:0'.
                 Otherwise, they are placed under where `get_inputs_tensors`
...
@@ -80,11 +80,13 @@ def set_logger_dir(dirname, action=None):
     Args:
         dirname(str): log directory
-        action(str): an action of ("k","d","q") to be performed
+        action(str): an action of ["k","d","q"] to be performed
            when the directory exists. Will ask user by default.

            "d": delete the directory. Note that the deletion may fail when
                the directory is used by tensorboard.

            "k": keep the directory. This is useful when you resume from a
                previous training and want the directory to look as if the
                training was not interrupted.
                Note that this option does not load old models or any other
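A hedged usage sketch of the "k" action when resuming a run (the directory name is illustrative):

```python
from tensorpack import logger

# Keep the existing log directory so the resumed run appends to it:
logger.set_logger_dir('train_log/resnet-run1', action='k')
```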
...