Commit 4744853b authored by Yuxin Wu

Migrate some examples to use tf.layers argument name convention (#627)

parent 7abf4ace
......@@ -8,8 +8,8 @@ Using the tensorpack implementations, you can also benefit from `argscope` and `
simplify the code.
Note that these layers were written because there were no other alternatives back at that time.
In the future we may shift the implementation to `tf.layers` because they will be better maintained.
You can start using `tf.layers` today as long as it fits your need.
Now, these layers actually call `tf.layers` directly.
You can just use `tf.layers` as long as it fits your need.
### argscope and LinearWrap
`argscope` gives you a context with default arguments.
......
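To illustrate the convention these examples are being migrated to, here is a minimal sketch (assuming the tensorpack layer wrappers; the function and layer names are illustrative only) of `argscope` supplying `tf.layers`-style defaults:

    import tensorflow as tf
    from tensorpack import argscope, Conv2D, MaxPooling, FullyConnected

    def tower(image):
        # every Conv2D in this block inherits kernel_size=3 and activation=tf.nn.relu
        with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
            l = Conv2D('conv0', image, filters=32)
            l = MaxPooling('pool0', l, 2)
            l = Conv2D('conv1', l, filters=32)
        # after this migration, FullyConnected defaults to a linear (identity) activation
        return FullyConnected('fc', l, 10)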
......@@ -79,19 +79,19 @@ class Model(ModelDesc):
def _get_NN_prediction(self, image):
image = tf.cast(image, tf.float32) / 255.0
with argscope(Conv2D, nl=tf.nn.relu):
l = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
with argscope(Conv2D, activation=tf.nn.relu):
l = Conv2D('conv0', image, 32, 5)
l = MaxPooling('pool0', l, 2)
l = Conv2D('conv1', l, out_channel=32, kernel_shape=5)
l = Conv2D('conv1', l, 32, 5)
l = MaxPooling('pool1', l, 2)
l = Conv2D('conv2', l, out_channel=64, kernel_shape=4)
l = Conv2D('conv2', l, 64, 4)
l = MaxPooling('pool2', l, 2)
l = Conv2D('conv3', l, out_channel=64, kernel_shape=3)
l = Conv2D('conv3', l, 64, 3)
l = FullyConnected('fc0', l, 512, nl=tf.identity)
l = FullyConnected('fc0', l, 512)
l = PReLU('prelu', l)
logits = FullyConnected('fc-pi', l, out_dim=NUM_ACTIONS, nl=tf.identity) # unnormalized policy
value = FullyConnected('fc-v', l, 1, nl=tf.identity)
logits = FullyConnected('fc-pi', l, NUM_ACTIONS) # unnormalized policy
value = FullyConnected('fc-v', l, 1)
return logits, value
def _build_graph(self, inputs):
......
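For reference, the argument renames applied throughout these hunks follow the `tf.layers` naming convention; a summary of the mapping, using only names that appear in this diff:

    # old tensorpack keyword          new tf.layers-style keyword
    # out_channel                 ->  filters
    # kernel_shape                ->  kernel_size
    # stride                      ->  strides
    # nl                          ->  activation
    # W_init                      ->  kernel_initializer
    # b_init                      ->  bias_initializer
    # out_dim                     ->  units
    # decay (BatchNorm)           ->  momentum
    # use_local_stat (BatchNorm)  ->  training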
......@@ -47,8 +47,8 @@ class Model(ModelDesc):
# o: b x t x HIDDEN
output = tf.reshape(outputs, [-1, HIDDEN]) # (Bxt) x rnnsize
logits = FullyConnected('fc', output, NR_CLASS, nl=tf.identity,
W_init=tf.truncated_normal_initializer(stddev=0.01))
logits = FullyConnected('fc', output, NR_CLASS, activation=tf.identity,
kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
logits = tf.reshape(logits, (BATCH, -1, NR_CLASS))
loss = tf.nn.ctc_loss(label, logits, seqlen, time_major=False)
......
......@@ -18,24 +18,24 @@ import tensorflow as tf
def tower_func(image):
# img: 227x227x3
with argscope([Conv2D, FullyConnected], nl=tf.nn.relu):
l = Conv2D('conv1', image, out_channel=96, kernel_shape=11, stride=4, padding='VALID')
with argscope([Conv2D, FullyConnected], activation=tf.nn.relu):
l = Conv2D('conv1', image, filters=96, kernel_size=11, strides=4, padding='VALID')
l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm1')
l = MaxPooling('pool1', l, 3, stride=2, padding='VALID')
l = MaxPooling('pool1', l, 3, strides=2, padding='VALID')
l = Conv2D('conv2', l, out_channel=256, kernel_shape=5, split=2)
l = Conv2D('conv2', l, filters=256, kernel_size=5, split=2)
l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm2')
l = MaxPooling('pool2', l, 3, stride=2, padding='VALID')
l = MaxPooling('pool2', l, 3, strides=2, padding='VALID')
l = Conv2D('conv3', l, out_channel=384, kernel_shape=3)
l = Conv2D('conv4', l, out_channel=384, kernel_shape=3, split=2)
l = Conv2D('conv5', l, out_channel=256, kernel_shape=3, split=2)
l = MaxPooling('pool3', l, 3, stride=2, padding='VALID')
l = Conv2D('conv3', l, filters=384, kernel_size=3)
l = Conv2D('conv4', l, filters=384, kernel_size=3, split=2)
l = Conv2D('conv5', l, filters=256, kernel_size=3, split=2)
l = MaxPooling('pool3', l, 3, strides=2, padding='VALID')
# This is just a script to load model, so we ignore the dropout layer
l = FullyConnected('fc6', l, 4096)
l = FullyConnected('fc7', l, out_dim=4096)
logits = FullyConnected('fc8', l, out_dim=1000, nl=tf.identity)
l = FullyConnected('fc7', l, 4096)
logits = FullyConnected('fc8', l, 1000)
tf.nn.softmax(logits, name='prob')
......
......@@ -48,9 +48,8 @@ def CPM(image):
gmap = tf.constant(get_gaussian_map())
gmap = tf.pad(gmap, [[0, 0], [0, 1], [0, 1], [0, 0]])
pool_center = AvgPooling('mappool', gmap, 9, stride=8, padding='VALID')
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu,
W_init=tf.random_normal_initializer(stddev=0.01)):
pool_center = AvgPooling('mappool', gmap, 9, strides=8, padding='VALID')
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
shared = (LinearWrap(image)
.Conv2D('conv1_1', 64)
.Conv2D('conv1_2', 64)
......@@ -78,16 +77,14 @@ def CPM(image):
l = tf.concat([l, shared, pool_center], 3,
name='concat_stage{}'.format(stage))
for i in range(1, 6):
l = Conv2D('Mconv{}_stage{}'.format(i, stage), l, 128)
l = Conv2D('Mconv6_stage{}'.format(stage), l, 128, kernel_shape=1)
l = Conv2D('Mconv7_stage{}'.format(stage),
l, 15, kernel_shape=1, nl=tf.identity)
l = Conv2D('Mconv{}_stage{}'.format(i, stage), l, 128, 7, activation=tf.nn.relu)
l = Conv2D('Mconv6_stage{}'.format(stage), l, 128, 1, activation=tf.nn.relu)
l = Conv2D('Mconv7_stage{}'.format(stage), l, 15, 1, activation=tf.identity)
return l
with argscope(Conv2D, kernel_shape=7, nl=tf.nn.relu):
out1 = (LinearWrap(shared)
.Conv2D('conv5_1_CPM', 512, kernel_shape=1)
.Conv2D('conv5_2_CPM', 15, kernel_shape=1, nl=tf.identity)())
.Conv2D('conv5_1_CPM', 512, 1, activation=tf.nn.relu)
.Conv2D('conv5_2_CPM', 15, 1, activation=tf.identity)())
out2 = add_stage(2, out1)
out3 = add_stage(3, out2)
out4 = add_stage(4, out3)
......
......@@ -17,7 +17,7 @@ from tensorpack.dataflow.dataset import ILSVRCMeta
def tower_func(image):
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
logits = (LinearWrap(image)
.Conv2D('conv1_1', 64)
.Conv2D('conv1_2', 64)
......@@ -42,11 +42,11 @@ def tower_func(image):
.Conv2D('conv5_3', 512)
.MaxPooling('pool5', 2)
# 7
.FullyConnected('fc6', 4096, nl=tf.nn.relu)
.FullyConnected('fc6', 4096, activation=tf.nn.relu)
.Dropout('drop0', 0.5)
.FullyConnected('fc7', 4096, nl=tf.nn.relu)
.FullyConnected('fc7', 4096, activation=tf.nn.relu)
.Dropout('drop1', 0.5)
.FullyConnected('fc8', out_dim=1000, nl=tf.identity)())
.FullyConnected('fc8', 1000)())
tf.nn.softmax(logits, name='prob')
......
......@@ -16,7 +16,7 @@ from tensorpack.dataflow.dataset import ILSVRCMeta
def tower_func(image):
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
logits = (LinearWrap(image)
.Conv2D('conv1_1', 64)
.Conv2D('conv1_2', 64)
......@@ -44,11 +44,11 @@ def tower_func(image):
.Conv2D('conv5_4', 512)
.MaxPooling('pool5', 2)
# 7
.FullyConnected('fc6', 4096, nl=tf.nn.relu)
.FullyConnected('fc6', 4096, activation=tf.nn.relu)
.Dropout('drop0', 0.5)
.FullyConnected('fc7', 4096, nl=tf.nn.relu)
.FullyConnected('fc7', 4096, activation=tf.nn.relu)
.Dropout('drop1', 0.5)
.FullyConnected('fc8', out_dim=1000, nl=tf.identity)())
.FullyConnected('fc8', 1000)())
tf.nn.softmax(logits, name='prob')
......
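The VGG towers above chain layers with `LinearWrap`; a shortened sketch of the same style under the new argument names (assuming the tensorpack wrappers; the layer names and sizes are placeholders):

    import tensorflow as tf
    from tensorpack import argscope, Conv2D, MaxPooling, FullyConnected, Dropout, LinearWrap

    def small_tower(image):
        with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
            # the trailing () unwraps the final tensor from the LinearWrap chain
            return (LinearWrap(image)
                    .Conv2D('conv1_1', 64)
                    .MaxPooling('pool1', 2)
                    .FullyConnected('fc6', 256, activation=tf.nn.relu)
                    .Dropout('drop0', 0.5)
                    .FullyConnected('fc7', 10)())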
......@@ -99,7 +99,7 @@ class Model(ModelDesc):
# seqlen x (Bxrnnsize)
output = tf.reshape(tf.concat(outputs, 1), [-1, param.rnn_size]) # (Bxseqlen) x rnnsize
logits = FullyConnected('fc', output, param.vocab_size, nl=tf.identity)
logits = FullyConnected('fc', output, param.vocab_size, activation=tf.identity)
tf.nn.softmax(logits / param.softmax_temprature, name='prob')
xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
......
......@@ -57,9 +57,9 @@ class Model(DQNModel):
with argscope(Conv2D, activation=lambda x: PReLU('prelu', x), use_bias=True):
l = (LinearWrap(image)
# Nature architecture
.Conv2D('conv0', out_channel=32, kernel_shape=8, stride=4)
.Conv2D('conv1', out_channel=64, kernel_shape=4, stride=2)
.Conv2D('conv2', out_channel=64, kernel_shape=3)
.Conv2D('conv0', 32, 8, strides=4)
.Conv2D('conv1', 64, 4, strides=2)
.Conv2D('conv2', 64, 3)
# architecture used for the figure in the README, slower but takes fewer iterations to converge
# .Conv2D('conv0', out_channel=32, kernel_shape=5)
......@@ -73,11 +73,11 @@ class Model(DQNModel):
.FullyConnected('fc0', 512)
.tf.nn.leaky_relu(alpha=0.01)())
if self.method != 'Dueling':
Q = FullyConnected('fct', l, self.num_actions, nl=tf.identity)
Q = FullyConnected('fct', l, self.num_actions)
else:
# Dueling DQN
V = FullyConnected('fctV', l, 1, nl=tf.identity)
As = FullyConnected('fctA', l, self.num_actions, nl=tf.identity)
V = FullyConnected('fctV', l, 1)
As = FullyConnected('fctA', l, self.num_actions)
Q = tf.add(As, V - tf.reduce_mean(As, 1, keep_dims=True))
return tf.identity(Q, name='Qvalue')
......
......@@ -33,14 +33,13 @@ class Model(mnist_example.Model):
image, label = inputs
image = tf.expand_dims(image, 3)
with argscope(Conv2D, kernel_shape=5, nl=tf.nn.relu):
logits = (LinearWrap(image) # the starting brace is only for line-breaking
.Conv2D('conv0', out_channel=32, padding='VALID')
logits = (LinearWrap(image) # the starting brace is only for line-breaking
.Conv2D('conv0', 32, 5, padding='VALID', activation=tf.nn.relu)
.MaxPooling('pool0', 2)
.Conv2D('conv1', out_channel=64, padding='VALID')
.Conv2D('conv1', 64, 5, padding='VALID', activation=tf.nn.relu)
.MaxPooling('pool1', 2)
.FullyConnected('fc0', 512, nl=tf.nn.relu)
.FullyConnected('fc1', out_dim=10, nl=tf.identity)())
.FullyConnected('fc0', 512, activation=tf.nn.relu)
.FullyConnected('fc1', out_dim=10, activation=tf.identity)())
tf.nn.softmax(logits, name='prob')
wrong = symbolic_functions.prediction_incorrect(logits, label)
......
......@@ -106,10 +106,10 @@ class Model(ModelDesc):
return fa(nonlin(x))
with remap_variables(new_get_variable), \
argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
argscope(Conv2D, use_bias=False):
logits = (LinearWrap(image)
.Conv2D('conv0', 96, 12, stride=4, padding='VALID')
.Conv2D('conv0', 96, 12, strides=4, padding='VALID')
.apply(activate)
.Conv2D('conv1', 256, 5, padding='SAME', split=2)
.apply(fg)
......@@ -139,7 +139,7 @@ class Model(ModelDesc):
.BatchNorm('bnfc0')
.apply(activate)
.FullyConnected('fc1', 4096)
.FullyConnected('fc1', 4096, use_bias=False)
.apply(fg)
.BatchNorm('bnfc1')
.apply(nonlin)
......
......@@ -72,8 +72,8 @@ class Model(ModelDesc):
image = image / 256.0
with remap_variables(binarize_weight), \
argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
argscope(Conv2D, use_bias=False, nl=tf.identity):
argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
argscope(Conv2D, use_bias=False):
logits = (LinearWrap(image)
.Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
.MaxPooling('pool0', 2, padding='SAME')
......@@ -106,7 +106,7 @@ class Model(ModelDesc):
.Conv2D('conv6', 512, 5, padding='VALID')
.apply(fg).BatchNorm('bn6')
.apply(cabs)
.FullyConnected('fc1', 10, nl=tf.identity)())
.FullyConnected('fc1', 10)())
tf.nn.softmax(logits, name='output')
# compute the number of failed samples
......
......@@ -55,16 +55,15 @@ class Model(ModelDesc):
def branch(name, l, up):
with tf.variable_scope(name):
l = Conv2D('convfc', l, 1, kernel_shape=1, nl=tf.identity,
l = Conv2D('convfc', l, 1, kernel_size=1, activation=tf.identity,
use_bias=True,
W_init=tf.constant_initializer(),
b_init=tf.constant_initializer())
kernel_initializer=tf.constant_initializer())
while up != 1:
l = BilinearUpSample('upsample{}'.format(up), l, 2)
up = up / 2
return l
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
l = Conv2D('conv1_1', image, 64)
l = Conv2D('conv1_2', l, 64)
b1 = branch('branch1', l, 1)
......@@ -93,9 +92,9 @@ class Model(ModelDesc):
b5 = branch('branch5', l, 16)
final_map = Conv2D('convfcweight',
tf.concat([b1, b2, b3, b4, b5], 3), 1, 1,
W_init=tf.constant_initializer(0.2),
use_bias=False, nl=tf.identity)
tf.concat([b1, b2, b3, b4, b5], 3), 1, kernel_size=1,
kernel_initializer=tf.constant_initializer(0.2),
use_bias=False, activation=tf.identity)
costs = []
for idx, b in enumerate([b1, b2, b3, b4, b5, final_map]):
output = tf.nn.sigmoid(b, name='output{}'.format(idx + 1))
......
......@@ -39,11 +39,11 @@ class Model(ModelDesc):
if nr1x1 != 0:
outs.append(Conv2D('conv1x1', x, nr1x1, 1))
x2 = Conv2D('conv3x3r', x, nr3x3r, 1)
outs.append(Conv2D('conv3x3', x2, nr3x3, 3, stride=stride))
outs.append(Conv2D('conv3x3', x2, nr3x3, 3, strides=stride))
x3 = Conv2D('conv233r', x, nr233r, 1)
x3 = Conv2D('conv233a', x3, nr233, 3)
outs.append(Conv2D('conv233b', x3, nr233, 3, stride=stride))
outs.append(Conv2D('conv233b', x3, nr233, 3, strides=stride))
if pooltype == 'max':
x4 = MaxPooling('mpool', x, 3, stride, padding='SAME')
......@@ -55,9 +55,9 @@ class Model(ModelDesc):
outs.append(x4)
return tf.concat(outs, 3, name='concat')
with argscope(Conv2D, nl=BNReLU, use_bias=False):
with argscope(Conv2D, activation=BNReLU, use_bias=False):
l = (LinearWrap(image)
.Conv2D('conv0', 64, 7, stride=2)
.Conv2D('conv0', 64, 7, strides=2)
.MaxPooling('pool0', 3, 2, padding='SAME')
.Conv2D('conv1', 64, 1)
.Conv2D('conv2', 192, 3)
......@@ -69,8 +69,8 @@ class Model(ModelDesc):
br1 = (LinearWrap(l)
.Conv2D('loss1conv', 128, 1)
.FullyConnected('loss1fc', 1024, nl=tf.nn.relu)
.FullyConnected('loss1logit', 1000, nl=tf.identity)())
.FullyConnected('loss1fc', 1024, activation=tf.nn.relu)
.FullyConnected('loss1logit', 1000, activation=tf.identity)())
loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=br1, labels=label)
loss1 = tf.reduce_mean(loss1, name='loss1')
......@@ -82,8 +82,8 @@ class Model(ModelDesc):
l = inception('incep4e', l, 0, 128, 192, 192, 256, 0, 'max')
br2 = Conv2D('loss2conv', l, 128, 1)
br2 = FullyConnected('loss2fc', br2, 1024, nl=tf.nn.relu)
br2 = FullyConnected('loss2logit', br2, 1000, nl=tf.identity)
br2 = FullyConnected('loss2fc', br2, 1024, activation=tf.nn.relu)
br2 = FullyConnected('loss2logit', br2, 1000, activation=tf.identity)
loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=br2, labels=label)
loss2 = tf.reduce_mean(loss2, name='loss2')
......@@ -92,7 +92,7 @@ class Model(ModelDesc):
l = inception('incep5b', l, 352, 192, 320, 192, 224, 128, 'max')
l = GlobalAvgPooling('gap', l)
logits = FullyConnected('linear', l, out_dim=1000, nl=tf.identity)
logits = FullyConnected('linear', l, 1000, activation=tf.identity)
tf.nn.softmax(logits, name='output')
loss3 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
loss3 = tf.reduce_mean(loss3, name='loss3')
......
......@@ -28,7 +28,7 @@ TOTAL_BATCH_SIZE = 1024
@layer_register(log_shape=True)
def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
W_init=None, nl=tf.identity):
W_init=None, activation=tf.identity):
in_shape = x.get_shape().as_list()
in_channel = in_shape[1]
assert out_channel % in_channel == 0
......@@ -41,7 +41,7 @@ def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
W = tf.get_variable('W', filter_shape, initializer=W_init)
conv = tf.nn.depthwise_conv2d(x, W, [1, 1, stride, stride], padding=padding, data_format='NCHW')
return nl(conv, name='output')
return activation(conv, name='output')
@under_name_scope()
......@@ -71,13 +71,13 @@ class Model(ImageNetModel):
# We do not apply group convolution on the first pointwise layer
# because the number of input channels is relatively small.
first_split = group if in_channel != 12 else 1
l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, nl=BNReLU)
l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, activation=BNReLU)
l = channel_shuffle(l, group)
l = DepthConv('dconv', l, out_channel // 4, 3, nl=BN, stride=stride)
l = DepthConv('dconv', l, out_channel // 4, 3, activation=BN, stride=stride)
l = Conv2D('conv2', l,
out_channel if stride == 1 else out_channel - in_channel,
1, split=group, nl=BN)
1, split=group, activation=BN)
if stride == 1: # unit (b)
output = tf.nn.relu(shortcut + l)
else: # unit (c)
......@@ -90,7 +90,7 @@ class Model(ImageNetModel):
group = 3
channels = [120, 240, 480]
l = Conv2D('conv1', image, 12, 3, stride=2, nl=BNReLU)
l = Conv2D('conv1', image, 12, 3, strides=2, activation=BNReLU)
l = MaxPooling('pool1', l, 3, 2, padding='SAME')
with tf.variable_scope('group1'):
......
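`DepthConv` above is registered with `@layer_register`, so it is called like the built-in layers, with the scope name first; a minimal usage sketch based on the signature shown in the hunk (the input tensor `l` and the literal sizes are placeholders):

    # depthwise 3x3 convolution in NCHW layout; out_channel must be a multiple of the input channels
    l = DepthConv('dconv0', l, out_channel=64, kernel_shape=3, stride=1, activation=tf.nn.relu)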
......@@ -28,9 +28,9 @@ class Model(ImageNetModel):
weight_decay = 5e-4
def get_logits(self, image):
with argscope(Conv2D, kernel_shape=3,
W_init=tf.variance_scaling_initializer(scale=2.)), \
argscope([Conv2D, MaxPooling, BatchNorm], data_format='NCHW'):
with argscope(Conv2D, kernel_size=3,
kernel_initializer=tf.variance_scaling_initializer(scale=2.)), \
argscope([Conv2D, MaxPooling, BatchNorm], data_format='channels_first'):
logits = (LinearWrap(image)
.apply(convnormrelu, 'conv1_1', 64)
.apply(convnormrelu, 'conv1_2', 64)
......@@ -56,15 +56,15 @@ class Model(ImageNetModel):
.MaxPooling('pool5', 2)
# 7
.FullyConnected('fc6', 4096,
W_init=tf.random_normal_initializer(stddev=0.001))
kernel_initializer=tf.random_normal_initializer(stddev=0.001))
.tf.nn.relu(name='fc6_relu')
.Dropout('drop0', rate=0.5)
.FullyConnected('fc7', 4096,
W_init=tf.random_normal_initializer(stddev=0.001))
kernel_initializer=tf.random_normal_initializer(stddev=0.001))
.tf.nn.relu(name='fc7_relu')
.Dropout('drop1', rate=0.5)
.FullyConnected('fc8', 1000,
W_init=tf.random_normal_initializer(stddev=0.01))())
kernel_initializer=tf.random_normal_initializer(stddev=0.01))())
add_param_summary(('.*', ['histogram', 'rms']))
return logits
......
......@@ -40,26 +40,26 @@ class Model(ModelDesc):
tf.summary.image("train_image", image, 10)
if tf.test.is_gpu_available():
image = tf.transpose(image, [0, 3, 1, 2])
data_format = 'NCHW'
data_format = 'channels_first'
else:
data_format = 'NHWC'
data_format = 'channels_last'
image = image / 4.0 # just to make range smaller
with argscope(Conv2D, nl=BNReLU, use_bias=False, kernel_shape=3), \
with argscope(Conv2D, activation=BNReLU, use_bias=False, kernel_size=3), \
argscope([Conv2D, MaxPooling, BatchNorm], data_format=data_format):
logits = LinearWrap(image) \
.Conv2D('conv1.1', out_channel=64) \
.Conv2D('conv1.2', out_channel=64) \
.Conv2D('conv1.1', filters=64) \
.Conv2D('conv1.2', filters=64) \
.MaxPooling('pool1', 3, stride=2, padding='SAME') \
.Conv2D('conv2.1', out_channel=128) \
.Conv2D('conv2.2', out_channel=128) \
.Conv2D('conv2.1', filters=128) \
.Conv2D('conv2.2', filters=128) \
.MaxPooling('pool2', 3, stride=2, padding='SAME') \
.Conv2D('conv3.1', out_channel=128, padding='VALID') \
.Conv2D('conv3.2', out_channel=128, padding='VALID') \
.FullyConnected('fc0', 1024 + 512, nl=tf.nn.relu) \
.Conv2D('conv3.1', filters=128, padding='VALID') \
.Conv2D('conv3.2', filters=128, padding='VALID') \
.FullyConnected('fc0', 1024 + 512, activation=tf.nn.relu) \
.tf.nn.dropout(keep_prob) \
.FullyConnected('fc1', 512, nl=tf.nn.relu) \
.FullyConnected('linear', out_dim=self.cifar_classnum, nl=tf.identity)()
.FullyConnected('fc1', 512, activation=tf.nn.relu) \
.FullyConnected('linear', out_dim=self.cifar_classnum)()
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
cost = tf.reduce_mean(cost, name='cross_entropy_loss')
......
......@@ -42,7 +42,7 @@ class Model(ModelDesc):
image = image * 2 - 1 # center the pixels values at zero
# The context manager `argscope` sets the default option for all the layers under
# this context. Here we use 32 channel convolution with shape 3x3
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu, filters=32):
logits = (LinearWrap(image)
.Conv2D('conv0')
.MaxPooling('pool0', 2)
......
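Defaults set by `argscope`, as in the hunk above, can still be overridden per call; a small sketch (hypothetical layer names) under the same `argscope(Conv2D, kernel_size=3, activation=tf.nn.relu, filters=32)`:

    l = Conv2D('conv0', image)                            # 32 filters, 3x3, ReLU, all from argscope
    l = Conv2D('conv1', l, filters=64, padding='VALID')   # override only the filter count and padding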
......@@ -31,7 +31,7 @@ class Model(ModelDesc):
image = image / 128.0 - 1
with argscope(Conv2D, nl=BNReLU, use_bias=False):
with argscope(Conv2D, activation=BNReLU, use_bias=False):
logits = (LinearWrap(image)
.Conv2D('conv1', 24, 5, padding='VALID')
.MaxPooling('pool1', 2, padding='SAME')
......@@ -39,10 +39,11 @@ class Model(ModelDesc):
.Conv2D('conv3', 32, 3, padding='VALID')
.MaxPooling('pool2', 2, padding='SAME')
.Conv2D('conv4', 64, 3, padding='VALID')
.Dropout('drop', 0.5)
.Dropout('drop', rate=0.5)
.FullyConnected('fc0', 512,
b_init=tf.constant_initializer(0.1), nl=tf.nn.relu)
.FullyConnected('linear', out_dim=10, nl=tf.identity)())
bias_initializer=tf.constant_initializer(0.1),
activation=tf.nn.relu)
.FullyConnected('linear', units=10)())
tf.nn.softmax(logits, name='output')
accuracy = tf.to_float(tf.nn.in_top_k(logits, label, 1))
......
......@@ -73,32 +73,27 @@ def reshape_for_bn(param, ndims, chan, data_format):
'use_bias': 'center',
'use_scale': 'scale',
'gamma_init': 'gamma_initializer',
'decay': 'momentum'
'decay': 'momentum',
'use_local_stat': 'training'
})
def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
scale=True, center=True,
def BatchNorm(inputs, training=None, momentum=0.9, epsilon=1e-5,
center=True, scale=True,
gamma_initializer=tf.ones_initializer(),
data_format='channels_last',
internal_update=False):
"""
Batch Normalization layer, as described in the paper:
`Batch Normalization: Accelerating Deep Network Training by
Reducing Internal Covariate Shift <http://arxiv.org/abs/1502.03167>`_.
Mostly equivalent to `tf.layers.batch_normalization`, but differs in
the following:
1. Accepts `data_format` rather than `axis`. For 2D input, this argument will be ignored.
2. Default value for `momentum` and `epsilon` is different.
3. Default value for `training` is automatically obtained from `TowerContext`.
4. Support the `internal_update` option.
Args:
x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should match data_format.
use_local_stat (bool): whether to use mean/var of the current batch or the moving average.
Defaults to True in training and False in inference.
momentum (float): momentum of moving average.
epsilon (float): epsilon to avoid divide-by-zero.
scale, center (bool): whether to use the extra affine transformation or not.
gamma_initializer: initializer for gamma (the scale).
internal_update (bool): if False, add EMA update ops to
`tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
which will be slightly slower.
Returns:
tf.Tensor: a tensor named ``output`` with the same shape of x.
by control dependencies.
Variable Names:
......@@ -110,18 +105,18 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
Note:
1. About multi-GPU training: moving averages across GPUs are not aggregated.
Batch statistics are computed independently. This is consistent with most frameworks.
2. Combinations of ``use_local_stat`` and ``ctx.is_training``:
* ``use_local_stat == is_training``: standard BN, EMA are
maintained during training and used during inference.
* ``use_local_stat and not is_training``: still use local (batch)
statistics in inference.
* ``not use_local_stat and is_training``: use EMA to normalize in
2. Combinations of ``training`` and ``ctx.is_training``:
* ``training == ctx.is_training``: standard BN, EMA are
maintained during training and used during inference. This is
the default.
* ``training and not ctx.is_training``: still use batch statistics in inference.
* ``not training and ctx.is_training``: use EMA to normalize in
training. This is useful when you load a pre-trained BN and
don't want to fine tune the EMA. EMA will not be updated in
this case.
"""
data_format = get_data_format(data_format, tfmode=False)
shape = x.get_shape().as_list()
shape = inputs.get_shape().as_list()
ndims = len(shape)
assert ndims in [2, 4]
if ndims == 2:
......@@ -134,17 +129,18 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, scale, center, gamma_initializer)
ctx = get_current_tower_context()
use_local_stat = training
if use_local_stat is None:
use_local_stat = ctx.is_training
use_local_stat = bool(use_local_stat)
if use_local_stat:
if ndims == 2:
x = tf.reshape(x, [-1, 1, 1, n_out]) # fused_bn only takes 4D input
inputs = tf.reshape(inputs, [-1, 1, 1, n_out]) # fused_bn only takes 4D input
# fused_bn has error using NCHW? (see #190)
xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
x, gamma, beta, epsilon=epsilon,
inputs, gamma, beta, epsilon=epsilon,
is_training=True, data_format=data_format)
if ndims == 2:
......@@ -159,19 +155,19 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
# Using moving_mean/moving_variance in training, which means we
# loaded a pre-trained BN and only fine-tuning the affine part.
xn, _, _ = tf.nn.fused_batch_norm(
x, gamma, beta,
inputs, gamma, beta,
mean=moving_mean, variance=moving_var, epsilon=epsilon,
data_format=data_format, is_training=False)
else:
if ndims == 4:
xn, _, _ = tf.nn.fused_batch_norm(
x, gamma, beta,
inputs, gamma, beta,
mean=moving_mean, variance=moving_var, epsilon=epsilon,
data_format=data_format, is_training=False)
else:
# avoid the reshape if possible (when channel is the last dimension)
xn = tf.nn.batch_normalization(
x, moving_mean, moving_var, beta, gamma, epsilon)
inputs, moving_mean, moving_var, beta, gamma, epsilon)
# maintain EMA only on one GPU is OK, even in replicated mode.
# because training time doesn't use EMA
......@@ -201,7 +197,7 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
'decay': 'momentum'
})
def BatchRenorm(x, rmax, dmax, momentum=0.9, epsilon=1e-5,
scale=True, center=True, gamma_initializer=None,
center=True, scale=True, gamma_initializer=None,
data_format='channels_last'):
"""
Batch Renormalization layer, as described in the paper:
......@@ -231,8 +227,7 @@ def BatchRenorm(x, rmax, dmax, momentum=0.9, epsilon=1e-5,
ndims = len(shape)
assert ndims in [2, 4]
if ndims == 2:
data_format = 'channels_last' # error using NCHW? (see #190)
x = tf.reshape(x, [-1, 1, 1, shape[1]])
data_format = 'channels_first'
ctx = get_current_tower_context()
coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
......
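A hedged usage sketch of the renamed `BatchNorm` arguments documented above, for the case of loading a pre-trained BN whose statistics should stay frozen, i.e. `training=False` inside a training tower (assuming the tensorpack wrapper; `l` is a placeholder tensor):

    from tensorpack import argscope, BatchNorm
    # normalize with the loaded moving mean/variance;
    # per the docstring above, the EMA is not updated in this case
    with argscope(BatchNorm, training=False):
        l = BatchNorm('bn_frozen', l)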