Commit 4744853b authored by Yuxin Wu

Migrate some examples to use tf.layers argument name convention (#627)

parent 7abf4ace
......@@ -8,8 +8,8 @@ Using the tensorpack implementations, you can also benefit from `argscope` and `
simplify the code.
Note that these layers were written because there were no other alternatives back at that time.
In the future we may shift the implementation to `tf.layers` because they will be better maintained.
You can start using `tf.layers` today as long as it fits your need.
Now, these layers actually call `tf.layers` directly.
You can just use `tf.layers` as long as it fits your need.
### argscope and LinearWrap
`argscope` gives you a context with default arguments.
......
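To illustrate the convention these examples are being migrated to, here is a minimal sketch (assuming the tensorpack layer wrappers; the function and layer names are illustrative only) of `argscope` supplying `tf.layers`-style defaults:

    import tensorflow as tf
    from tensorpack import argscope, Conv2D, MaxPooling, FullyConnected

    def tower(image):
        # every Conv2D in this block inherits kernel_size=3 and activation=tf.nn.relu
        with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
            l = Conv2D('conv0', image, filters=32)
            l = MaxPooling('pool0', l, 2)
            l = Conv2D('conv1', l, filters=32)
        # after this migration, FullyConnected defaults to a linear (identity) activation
        return FullyConnected('fc', l, 10)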
......@@ -79,19 +79,19 @@ class Model(ModelDesc):
def _get_NN_prediction(self, image):
image = tf.cast(image, tf.float32) / 255.0
with argscope(Conv2D, nl=tf.nn.relu):
l = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
with argscope(Conv2D, activation=tf.nn.relu):
l = Conv2D('conv0', image, 32, 5)
l = MaxPooling('pool0', l, 2)
l = Conv2D('conv1', l, out_channel=32, kernel_shape=5)
l = Conv2D('conv1', l, 32, 5)
l = MaxPooling('pool1', l, 2)
l = Conv2D('conv2', l, out_channel=64, kernel_shape=4)
l = Conv2D('conv2', l, 64, 4)
l = MaxPooling('pool2', l, 2)
l = Conv2D('conv3', l, out_channel=64, kernel_shape=3)
l = Conv2D('conv3', l, 64, 3)
l = FullyConnected('fc0', l, 512, nl=tf.identity)
l = FullyConnected('fc0', l, 512)
l = PReLU('prelu', l)
logits = FullyConnected('fc-pi', l, out_dim=NUM_ACTIONS, nl=tf.identity) # unnormalized policy
value = FullyConnected('fc-v', l, 1, nl=tf.identity)
logits = FullyConnected('fc-pi', l, NUM_ACTIONS) # unnormalized policy
value = FullyConnected('fc-v', l, 1)
return logits, value
def _build_graph(self, inputs):
......
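For reference, the argument renames applied throughout these hunks follow the `tf.layers` naming convention; a summary of the mapping, using only names that appear in this diff:

    # old tensorpack keyword          new tf.layers-style keyword
    # out_channel                 ->  filters
    # kernel_shape                ->  kernel_size
    # stride                      ->  strides
    # nl                          ->  activation
    # W_init                      ->  kernel_initializer
    # b_init                      ->  bias_initializer
    # out_dim                     ->  units
    # decay (BatchNorm)           ->  momentum
    # use_local_stat (BatchNorm)  ->  training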
......@@ -47,8 +47,8 @@ class Model(ModelDesc):
# o: b x t x HIDDEN
output = tf.reshape(outputs, [-1, HIDDEN]) # (Bxt) x rnnsize
logits = FullyConnected('fc', output, NR_CLASS, nl=tf.identity,
W_init=tf.truncated_normal_initializer(stddev=0.01))
logits = FullyConnected('fc', output, NR_CLASS, activation=tf.identity,
kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
logits = tf.reshape(logits, (BATCH, -1, NR_CLASS))
loss = tf.nn.ctc_loss(label, logits, seqlen, time_major=False)
......
......@@ -18,24 +18,24 @@ import tensorflow as tf
def tower_func(image):
# img: 227x227x3
with argscope([Conv2D, FullyConnected], nl=tf.nn.relu):
l = Conv2D('conv1', image, out_channel=96, kernel_shape=11, stride=4, padding='VALID')
with argscope([Conv2D, FullyConnected], activation=tf.nn.relu):
l = Conv2D('conv1', image, filters=96, kernel_size=11, strides=4, padding='VALID')
l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm1')
l = MaxPooling('pool1', l, 3, stride=2, padding='VALID')
l = MaxPooling('pool1', l, 3, strides=2, padding='VALID')
l = Conv2D('conv2', l, out_channel=256, kernel_shape=5, split=2)
l = Conv2D('conv2', l, filters=256, kernel_size=5, split=2)
l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm2')
l = MaxPooling('pool2', l, 3, stride=2, padding='VALID')
l = MaxPooling('pool2', l, 3, strides=2, padding='VALID')
l = Conv2D('conv3', l, out_channel=384, kernel_shape=3)
l = Conv2D('conv4', l, out_channel=384, kernel_shape=3, split=2)
l = Conv2D('conv5', l, out_channel=256, kernel_shape=3, split=2)
l = MaxPooling('pool3', l, 3, stride=2, padding='VALID')
l = Conv2D('conv3', l, filters=384, kernel_size=3)
l = Conv2D('conv4', l, filters=384, kernel_size=3, split=2)
l = Conv2D('conv5', l, filters=256, kernel_size=3, split=2)
l = MaxPooling('pool3', l, 3, strides=2, padding='VALID')
# This is just a script to load model, so we ignore the dropout layer
l = FullyConnected('fc6', l, 4096)
l = FullyConnected('fc7', l, out_dim=4096)
logits = FullyConnected('fc8', l, out_dim=1000, nl=tf.identity)
l = FullyConnected('fc7', l, 4096)
logits = FullyConnected('fc8', l, 1000)
tf.nn.softmax(logits, name='prob')
......
......@@ -48,9 +48,8 @@ def CPM(image):
gmap = tf.constant(get_gaussian_map())
gmap = tf.pad(gmap, [[0, 0], [0, 1], [0, 1], [0, 0]])
pool_center = AvgPooling('mappool', gmap, 9, stride=8, padding='VALID')
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu,
W_init=tf.random_normal_initializer(stddev=0.01)):
pool_center = AvgPooling('mappool', gmap, 9, strides=8, padding='VALID')
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
shared = (LinearWrap(image)
.Conv2D('conv1_1', 64)
.Conv2D('conv1_2', 64)
......@@ -78,16 +77,14 @@ def CPM(image):
l = tf.concat([l, shared, pool_center], 3,
name='concat_stage{}'.format(stage))
for i in range(1, 6):
l = Conv2D('Mconv{}_stage{}'.format(i, stage), l, 128)
l = Conv2D('Mconv6_stage{}'.format(stage), l, 128, kernel_shape=1)
l = Conv2D('Mconv7_stage{}'.format(stage),
l, 15, kernel_shape=1, nl=tf.identity)
l = Conv2D('Mconv{}_stage{}'.format(i, stage), l, 128, 7, activation=tf.nn.relu)
l = Conv2D('Mconv6_stage{}'.format(stage), l, 128, 1, activation=tf.nn.relu)
l = Conv2D('Mconv7_stage{}'.format(stage), l, 15, 1, activation=tf.identity)
return l
with argscope(Conv2D, kernel_shape=7, nl=tf.nn.relu):
out1 = (LinearWrap(shared)
.Conv2D('conv5_1_CPM', 512, kernel_shape=1)
.Conv2D('conv5_2_CPM', 15, kernel_shape=1, nl=tf.identity)())
.Conv2D('conv5_1_CPM', 512, 1, activation=tf.nn.relu)
.Conv2D('conv5_2_CPM', 15, 1, activation=tf.identity)())
out2 = add_stage(2, out1)
out3 = add_stage(3, out2)
out4 = add_stage(4, out3)
......
......@@ -17,7 +17,7 @@ from tensorpack.dataflow.dataset import ILSVRCMeta
def tower_func(image):
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
logits = (LinearWrap(image)
.Conv2D('conv1_1', 64)
.Conv2D('conv1_2', 64)
......@@ -42,11 +42,11 @@ def tower_func(image):
.Conv2D('conv5_3', 512)
.MaxPooling('pool5', 2)
# 7
.FullyConnected('fc6', 4096, nl=tf.nn.relu)
.FullyConnected('fc6', 4096, activation=tf.nn.relu)
.Dropout('drop0', 0.5)
.FullyConnected('fc7', 4096, nl=tf.nn.relu)
.FullyConnected('fc7', 4096, activation=tf.nn.relu)
.Dropout('drop1', 0.5)
.FullyConnected('fc8', out_dim=1000, nl=tf.identity)())
.FullyConnected('fc8', 1000)())
tf.nn.softmax(logits, name='prob')
......
......@@ -16,7 +16,7 @@ from tensorpack.dataflow.dataset import ILSVRCMeta
def tower_func(image):
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
logits = (LinearWrap(image)
.Conv2D('conv1_1', 64)
.Conv2D('conv1_2', 64)
......@@ -44,11 +44,11 @@ def tower_func(image):
.Conv2D('conv5_4', 512)
.MaxPooling('pool5', 2)
# 7
.FullyConnected('fc6', 4096, nl=tf.nn.relu)
.FullyConnected('fc6', 4096, activation=tf.nn.relu)
.Dropout('drop0', 0.5)
.FullyConnected('fc7', 4096, nl=tf.nn.relu)
.FullyConnected('fc7', 4096, activation=tf.nn.relu)
.Dropout('drop1', 0.5)
.FullyConnected('fc8', out_dim=1000, nl=tf.identity)())
.FullyConnected('fc8', 1000)())
tf.nn.softmax(logits, name='prob')
......
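The VGG towers above chain layers with `LinearWrap`; a shortened sketch of the same style under the new argument names (assuming the tensorpack wrappers; the layer names and sizes are placeholders):

    import tensorflow as tf
    from tensorpack import argscope, Conv2D, MaxPooling, FullyConnected, Dropout, LinearWrap

    def small_tower(image):
        with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
            # the trailing () unwraps the final tensor from the LinearWrap chain
            return (LinearWrap(image)
                    .Conv2D('conv1_1', 64)
                    .MaxPooling('pool1', 2)
                    .FullyConnected('fc6', 256, activation=tf.nn.relu)
                    .Dropout('drop0', 0.5)
                    .FullyConnected('fc7', 10)())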
......@@ -99,7 +99,7 @@ class Model(ModelDesc):
# seqlen x (Bxrnnsize)
output = tf.reshape(tf.concat(outputs, 1), [-1, param.rnn_size]) # (Bxseqlen) x rnnsize
logits = FullyConnected('fc', output, param.vocab_size, nl=tf.identity)
logits = FullyConnected('fc', output, param.vocab_size, activation=tf.identity)
tf.nn.softmax(logits / param.softmax_temprature, name='prob')
xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
......
......@@ -57,9 +57,9 @@ class Model(DQNModel):
with argscope(Conv2D, activation=lambda x: PReLU('prelu', x), use_bias=True):
l = (LinearWrap(image)
# Nature architecture
.Conv2D('conv0', out_channel=32, kernel_shape=8, stride=4)
.Conv2D('conv1', out_channel=64, kernel_shape=4, stride=2)
.Conv2D('conv2', out_channel=64, kernel_shape=3)
.Conv2D('conv0', 32, 8, strides=4)
.Conv2D('conv1', 64, 4, strides=2)
.Conv2D('conv2', 64, 3)
# architecture used for the figure in the README, slower but takes fewer iterations to converge
# .Conv2D('conv0', out_channel=32, kernel_shape=5)
......@@ -73,11 +73,11 @@ class Model(DQNModel):
.FullyConnected('fc0', 512)
.tf.nn.leaky_relu(alpha=0.01)())
if self.method != 'Dueling':
Q = FullyConnected('fct', l, self.num_actions, nl=tf.identity)
Q = FullyConnected('fct', l, self.num_actions)
else:
# Dueling DQN
V = FullyConnected('fctV', l, 1, nl=tf.identity)
As = FullyConnected('fctA', l, self.num_actions, nl=tf.identity)
V = FullyConnected('fctV', l, 1)
As = FullyConnected('fctA', l, self.num_actions)
Q = tf.add(As, V - tf.reduce_mean(As, 1, keep_dims=True))
return tf.identity(Q, name='Qvalue')
......
......@@ -33,14 +33,13 @@ class Model(mnist_example.Model):
image, label = inputs
image = tf.expand_dims(image, 3)
with argscope(Conv2D, kernel_shape=5, nl=tf.nn.relu):
logits = (LinearWrap(image) # the starting brace is only for line-breaking
.Conv2D('conv0', out_channel=32, padding='VALID')
logits = (LinearWrap(image) # the starting brace is only for line-breaking
.Conv2D('conv0', 32, 5, padding='VALID', activation=tf.nn.relu)
.MaxPooling('pool0', 2)
.Conv2D('conv1', out_channel=64, padding='VALID')
.Conv2D('conv1', 64, 5, padding='VALID', activation=tf.nn.relu)
.MaxPooling('pool1', 2)
.FullyConnected('fc0', 512, nl=tf.nn.relu)
.FullyConnected('fc1', out_dim=10, nl=tf.identity)())
.FullyConnected('fc0', 512, activation=tf.nn.relu)
.FullyConnected('fc1', out_dim=10, activation=tf.identity)())
tf.nn.softmax(logits, name='prob')
wrong = symbolic_functions.prediction_incorrect(logits, label)
......
......@@ -106,10 +106,10 @@ class Model(ModelDesc):
return fa(nonlin(x))
with remap_variables(new_get_variable), \
argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
argscope(Conv2D, use_bias=False):
logits = (LinearWrap(image)
.Conv2D('conv0', 96, 12, stride=4, padding='VALID')
.Conv2D('conv0', 96, 12, strides=4, padding='VALID')
.apply(activate)
.Conv2D('conv1', 256, 5, padding='SAME', split=2)
.apply(fg)
......@@ -139,7 +139,7 @@ class Model(ModelDesc):
.BatchNorm('bnfc0')
.apply(activate)
.FullyConnected('fc1', 4096)
.FullyConnected('fc1', 4096, use_bias=False)
.apply(fg)
.BatchNorm('bnfc1')
.apply(nonlin)
......
......@@ -72,8 +72,8 @@ class Model(ModelDesc):
image = image / 256.0
with remap_variables(binarize_weight), \
argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
argscope(Conv2D, use_bias=False, nl=tf.identity):
argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
argscope(Conv2D, use_bias=False):
logits = (LinearWrap(image)
.Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
.MaxPooling('pool0', 2, padding='SAME')
......@@ -106,7 +106,7 @@ class Model(ModelDesc):
.Conv2D('conv6', 512, 5, padding='VALID')
.apply(fg).BatchNorm('bn6')
.apply(cabs)
.FullyConnected('fc1', 10, nl=tf.identity)())
.FullyConnected('fc1', 10)())
tf.nn.softmax(logits, name='output')
# compute the number of failed samples
......
......@@ -55,16 +55,15 @@ class Model(ModelDesc):
def branch(name, l, up):
with tf.variable_scope(name):
l = Conv2D('convfc', l, 1, kernel_shape=1, nl=tf.identity,
l = Conv2D('convfc', l, 1, kernel_size=1, activation=tf.identity,
use_bias=True,
W_init=tf.constant_initializer(),
b_init=tf.constant_initializer())
kernel_initializer=tf.constant_initializer())
while up != 1:
l = BilinearUpSample('upsample{}'.format(up), l, 2)
up = up / 2
return l
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
l = Conv2D('conv1_1', image, 64)
l = Conv2D('conv1_2', l, 64)
b1 = branch('branch1', l, 1)
......@@ -93,9 +92,9 @@ class Model(ModelDesc):
b5 = branch('branch5', l, 16)
final_map = Conv2D('convfcweight',
tf.concat([b1, b2, b3, b4, b5], 3), 1, 1,
W_init=tf.constant_initializer(0.2),
use_bias=False, nl=tf.identity)
tf.concat([b1, b2, b3, b4, b5], 3), 1, kernel_size=1,
kernel_initializer=tf.constant_initializer(0.2),
use_bias=False, activation=tf.identity)
costs = []
for idx, b in enumerate([b1, b2, b3, b4, b5, final_map]):
output = tf.nn.sigmoid(b, name='output{}'.format(idx + 1))
......
......@@ -39,11 +39,11 @@ class Model(ModelDesc):
if nr1x1 != 0:
outs.append(Conv2D('conv1x1', x, nr1x1, 1))
x2 = Conv2D('conv3x3r', x, nr3x3r, 1)
outs.append(Conv2D('conv3x3', x2, nr3x3, 3, stride=stride))
outs.append(Conv2D('conv3x3', x2, nr3x3, 3, strides=stride))
x3 = Conv2D('conv233r', x, nr233r, 1)
x3 = Conv2D('conv233a', x3, nr233, 3)
outs.append(Conv2D('conv233b', x3, nr233, 3, stride=stride))
outs.append(Conv2D('conv233b', x3, nr233, 3, strides=stride))
if pooltype == 'max':
x4 = MaxPooling('mpool', x, 3, stride, padding='SAME')
......@@ -55,9 +55,9 @@ class Model(ModelDesc):
outs.append(x4)
return tf.concat(outs, 3, name='concat')
with argscope(Conv2D, nl=BNReLU, use_bias=False):
with argscope(Conv2D, activation=BNReLU, use_bias=False):
l = (LinearWrap(image)
.Conv2D('conv0', 64, 7, stride=2)
.Conv2D('conv0', 64, 7, strides=2)
.MaxPooling('pool0', 3, 2, padding='SAME')
.Conv2D('conv1', 64, 1)
.Conv2D('conv2', 192, 3)
......@@ -69,8 +69,8 @@ class Model(ModelDesc):
br1 = (LinearWrap(l)
.Conv2D('loss1conv', 128, 1)
.FullyConnected('loss1fc', 1024, nl=tf.nn.relu)
.FullyConnected('loss1logit', 1000, nl=tf.identity)())
.FullyConnected('loss1fc', 1024, activation=tf.nn.relu)
.FullyConnected('loss1logit', 1000, activation=tf.identity)())
loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=br1, labels=label)
loss1 = tf.reduce_mean(loss1, name='loss1')
......@@ -82,8 +82,8 @@ class Model(ModelDesc):
l = inception('incep4e', l, 0, 128, 192, 192, 256, 0, 'max')
br2 = Conv2D('loss2conv', l, 128, 1)
br2 = FullyConnected('loss2fc', br2, 1024, nl=tf.nn.relu)
br2 = FullyConnected('loss2logit', br2, 1000, nl=tf.identity)
br2 = FullyConnected('loss2fc', br2, 1024, activation=tf.nn.relu)
br2 = FullyConnected('loss2logit', br2, 1000, activation=tf.identity)
loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=br2, labels=label)
loss2 = tf.reduce_mean(loss2, name='loss2')
......@@ -92,7 +92,7 @@ class Model(ModelDesc):
l = inception('incep5b', l, 352, 192, 320, 192, 224, 128, 'max')
l = GlobalAvgPooling('gap', l)
logits = FullyConnected('linear', l, out_dim=1000, nl=tf.identity)
logits = FullyConnected('linear', l, 1000, activation=tf.identity)
tf.nn.softmax(logits, name='output')
loss3 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
loss3 = tf.reduce_mean(loss3, name='loss3')
......
......@@ -28,7 +28,7 @@ TOTAL_BATCH_SIZE = 1024
@layer_register(log_shape=True)
def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
W_init=None, nl=tf.identity):
W_init=None, activation=tf.identity):
in_shape = x.get_shape().as_list()
in_channel = in_shape[1]
assert out_channel % in_channel == 0
......@@ -41,7 +41,7 @@ def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
W = tf.get_variable('W', filter_shape, initializer=W_init)
conv = tf.nn.depthwise_conv2d(x, W, [1, 1, stride, stride], padding=padding, data_format='NCHW')
return nl(conv, name='output')
return activation(conv, name='output')
@under_name_scope()
......@@ -71,13 +71,13 @@ class Model(ImageNetModel):
# We do not apply group convolution on the first pointwise layer
# because the number of input channels is relatively small.
first_split = group if in_channel != 12 else 1
l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, nl=BNReLU)
l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, activation=BNReLU)
l = channel_shuffle(l, group)
l = DepthConv('dconv', l, out_channel // 4, 3, nl=BN, stride=stride)
l = DepthConv('dconv', l, out_channel // 4, 3, activation=BN, stride=stride)
l = Conv2D('conv2', l,
out_channel if stride == 1 else out_channel - in_channel,
1, split=group, nl=BN)
1, split=group, activation=BN)
if stride == 1: # unit (b)
output = tf.nn.relu(shortcut + l)
else: # unit (c)
......@@ -90,7 +90,7 @@ class Model(ImageNetModel):
group = 3
channels = [120, 240, 480]
l = Conv2D('conv1', image, 12, 3, stride=2, nl=BNReLU)
l = Conv2D('conv1', image, 12, 3, strides=2, activation=BNReLU)
l = MaxPooling('pool1', l, 3, 2, padding='SAME')
with tf.variable_scope('group1'):
......
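`DepthConv` above is registered with `@layer_register`, so it is called like the built-in layers, with the scope name first; a minimal usage sketch based on the signature shown in the hunk (the input tensor `l` and the literal sizes are placeholders):

    # depthwise 3x3 convolution in NCHW layout; out_channel must be a multiple of the input channels
    l = DepthConv('dconv0', l, out_channel=64, kernel_shape=3, stride=1, activation=tf.nn.relu)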
......@@ -28,9 +28,9 @@ class Model(ImageNetModel):
weight_decay = 5e-4
def get_logits(self, image):
with argscope(Conv2D, kernel_shape=3,
W_init=tf.variance_scaling_initializer(scale=2.)), \
argscope([Conv2D, MaxPooling, BatchNorm], data_format='NCHW'):
with argscope(Conv2D, kernel_size=3,
kernel_initializer=tf.variance_scaling_initializer(scale=2.)), \
argscope([Conv2D, MaxPooling, BatchNorm], data_format='channels_first'):
logits = (LinearWrap(image)
.apply(convnormrelu, 'conv1_1', 64)
.apply(convnormrelu, 'conv1_2', 64)
......@@ -56,15 +56,15 @@ class Model(ImageNetModel):
.MaxPooling('pool5', 2)
# 7
.FullyConnected('fc6', 4096,
W_init=tf.random_normal_initializer(stddev=0.001))
kernel_initializer=tf.random_normal_initializer(stddev=0.001))
.tf.nn.relu(name='fc6_relu')
.Dropout('drop0', rate=0.5)
.FullyConnected('fc7', 4096,
W_init=tf.random_normal_initializer(stddev=0.001))
kernel_initializer=tf.random_normal_initializer(stddev=0.001))
.tf.nn.relu(name='fc7_relu')
.Dropout('drop1', rate=0.5)
.FullyConnected('fc8', 1000,
W_init=tf.random_normal_initializer(stddev=0.01))())
kernel_initializer=tf.random_normal_initializer(stddev=0.01))())
add_param_summary(('.*', ['histogram', 'rms']))
return logits
......
......@@ -40,26 +40,26 @@ class Model(ModelDesc):
tf.summary.image("train_image", image, 10)
if tf.test.is_gpu_available():
image = tf.transpose(image, [0, 3, 1, 2])
data_format = 'NCHW'
data_format = 'channels_first'
else:
data_format = 'NHWC'
data_format = 'channels_last'
image = image / 4.0 # just to make range smaller
with argscope(Conv2D, nl=BNReLU, use_bias=False, kernel_shape=3), \
with argscope(Conv2D, activation=BNReLU, use_bias=False, kernel_size=3), \
argscope([Conv2D, MaxPooling, BatchNorm], data_format=data_format):
logits = LinearWrap(image) \
.Conv2D('conv1.1', out_channel=64) \
.Conv2D('conv1.2', out_channel=64) \
.Conv2D('conv1.1', filters=64) \
.Conv2D('conv1.2', filters=64) \
.MaxPooling('pool1', 3, stride=2, padding='SAME') \
.Conv2D('conv2.1', out_channel=128) \
.Conv2D('conv2.2', out_channel=128) \
.Conv2D('conv2.1', filters=128) \
.Conv2D('conv2.2', filters=128) \
.MaxPooling('pool2', 3, stride=2, padding='SAME') \
.Conv2D('conv3.1', out_channel=128, padding='VALID') \
.Conv2D('conv3.2', out_channel=128, padding='VALID') \
.FullyConnected('fc0', 1024 + 512, nl=tf.nn.relu) \
.Conv2D('conv3.1', filters=128, padding='VALID') \
.Conv2D('conv3.2', filters=128, padding='VALID') \
.FullyConnected('fc0', 1024 + 512, activation=tf.nn.relu) \
.tf.nn.dropout(keep_prob) \
.FullyConnected('fc1', 512, nl=tf.nn.relu) \
.FullyConnected('linear', out_dim=self.cifar_classnum, nl=tf.identity)()
.FullyConnected('fc1', 512, activation=tf.nn.relu) \
.FullyConnected('linear', out_dim=self.cifar_classnum)()
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
cost = tf.reduce_mean(cost, name='cross_entropy_loss')
......
......@@ -42,7 +42,7 @@ class Model(ModelDesc):
image = image * 2 - 1 # center the pixels values at zero
# The context manager `argscope` sets the default option for all the layers under
# this context. Here we use 32 channel convolution with shape 3x3
with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu, filters=32):
logits = (LinearWrap(image)
.Conv2D('conv0')
.MaxPooling('pool0', 2)
......
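Defaults set by `argscope`, as in the hunk above, can still be overridden per call; a small sketch (hypothetical layer names) under the same `argscope(Conv2D, kernel_size=3, activation=tf.nn.relu, filters=32)`:

    l = Conv2D('conv0', image)                            # 32 filters, 3x3, ReLU, all from argscope
    l = Conv2D('conv1', l, filters=64, padding='VALID')   # override only the filter count and padding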
......@@ -31,7 +31,7 @@ class Model(ModelDesc):
image = image / 128.0 - 1
with argscope(Conv2D, nl=BNReLU, use_bias=False):
with argscope(Conv2D, activation=BNReLU, use_bias=False):
logits = (LinearWrap(image)
.Conv2D('conv1', 24, 5, padding='VALID')
.MaxPooling('pool1', 2, padding='SAME')
......@@ -39,10 +39,11 @@ class Model(ModelDesc):
.Conv2D('conv3', 32, 3, padding='VALID')
.MaxPooling('pool2', 2, padding='SAME')
.Conv2D('conv4', 64, 3, padding='VALID')
.Dropout('drop', 0.5)
.Dropout('drop', rate=0.5)
.FullyConnected('fc0', 512,
b_init=tf.constant_initializer(0.1), nl=tf.nn.relu)
.FullyConnected('linear', out_dim=10, nl=tf.identity)())
bias_initializer=tf.constant_initializer(0.1),
activation=tf.nn.relu)
.FullyConnected('linear', units=10)())
tf.nn.softmax(logits, name='output')
accuracy = tf.to_float(tf.nn.in_top_k(logits, label, 1))
......
......@@ -73,32 +73,27 @@ def reshape_for_bn(param, ndims, chan, data_format):
'use_bias': 'center',
'use_scale': 'scale',
'gamma_init': 'gamma_initializer',
'decay': 'momentum'
'decay': 'momentum',
'use_local_stat': 'training'
})
def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
scale=True, center=True,
def BatchNorm(inputs, training=None, momentum=0.9, epsilon=1e-5,
center=True, scale=True,
gamma_initializer=tf.ones_initializer(),
data_format='channels_last',
internal_update=False):
"""
Batch Normalization layer, as described in the paper:
`Batch Normalization: Accelerating Deep Network Training by
Reducing Internal Covariate Shift <http://arxiv.org/abs/1502.03167>`_.
Mostly equivalent to `tf.layers.batch_normalization`, but differs in
the following:
1. Accepts `data_format` rather than `axis`. For 2D input, this argument will be ignored.
2. Default value for `momentum` and `epsilon` is different.
3. Default value for `training` is automatically obtained from `TowerContext`.
4. Support the `internal_update` option.
Args:
x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should match data_format.
use_local_stat (bool): whether to use mean/var of the current batch or the moving average.
Defaults to True in training and False in inference.
momentum (float): momentum of moving average.
epsilon (float): epsilon to avoid divide-by-zero.
scale, center (bool): whether to use the extra affine transformation or not.
gamma_initializer: initializer for gamma (the scale).
internal_update (bool): if False, add EMA update ops to
`tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
which will be slightly slower.
Returns:
tf.Tensor: a tensor named ``output`` with the same shape of x.
by control dependencies.
Variable Names:
......@@ -110,18 +105,18 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
Note:
1. About multi-GPU training: moving averages across GPUs are not aggregated.
Batch statistics are computed independently. This is consistent with most frameworks.
2. Combinations of ``use_local_stat`` and ``ctx.is_training``:
* ``use_local_stat == is_training``: standard BN, EMA are
maintained during training and used during inference.
* ``use_local_stat and not is_training``: still use local (batch)
statistics in inference.
* ``not use_local_stat and is_training``: use EMA to normalize in
2. Combinations of ``training`` and ``ctx.is_training``:
* ``training == ctx.is_training``: standard BN, EMA are
maintained during training and used during inference. This is
the default.
* ``training and not ctx.is_training``: still use batch statistics in inference.
* ``not training and ctx.is_training``: use EMA to normalize in
training. This is useful when you load a pre-trained BN and
don't want to fine tune the EMA. EMA will not be updated in
this case.
"""
data_format = get_data_format(data_format, tfmode=False)
shape = x.get_shape().as_list()
shape = inputs.get_shape().as_list()
ndims = len(shape)
assert ndims in [2, 4]
if ndims == 2:
......@@ -134,17 +129,18 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, scale, center, gamma_initializer)
ctx = get_current_tower_context()
use_local_stat = training
if use_local_stat is None:
use_local_stat = ctx.is_training
use_local_stat = bool(use_local_stat)
if use_local_stat:
if ndims == 2:
x = tf.reshape(x, [-1, 1, 1, n_out]) # fused_bn only takes 4D input
inputs = tf.reshape(inputs, [-1, 1, 1, n_out]) # fused_bn only takes 4D input
# fused_bn has error using NCHW? (see #190)
xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
x, gamma, beta, epsilon=epsilon,
inputs, gamma, beta, epsilon=epsilon,
is_training=True, data_format=data_format)
if ndims == 2:
......@@ -159,19 +155,19 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
# Using moving_mean/moving_variance in training, which means we
# loaded a pre-trained BN and only fine-tuning the affine part.
xn, _, _ = tf.nn.fused_batch_norm(
x, gamma, beta,
inputs, gamma, beta,
mean=moving_mean, variance=moving_var, epsilon=epsilon,
data_format=data_format, is_training=False)
else:
if ndims == 4:
xn, _, _ = tf.nn.fused_batch_norm(
x, gamma, beta,
inputs, gamma, beta,
mean=moving_mean, variance=moving_var, epsilon=epsilon,
data_format=data_format, is_training=False)
else:
# avoid the reshape if possible (when channel is the last dimension)
xn = tf.nn.batch_normalization(
x, moving_mean, moving_var, beta, gamma, epsilon)
inputs, moving_mean, moving_var, beta, gamma, epsilon)
# maintain EMA only on one GPU is OK, even in replicated mode.
# because training time doesn't use EMA
......@@ -201,7 +197,7 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
'decay': 'momentum'
})
def BatchRenorm(x, rmax, dmax, momentum=0.9, epsilon=1e-5,
scale=True, center=True, gamma_initializer=None,
center=True, scale=True, gamma_initializer=None,
data_format='channels_last'):
"""
Batch Renormalization layer, as described in the paper:
......@@ -231,8 +227,7 @@ def BatchRenorm(x, rmax, dmax, momentum=0.9, epsilon=1e-5,
ndims = len(shape)
assert ndims in [2, 4]
if ndims == 2:
data_format = 'channels_last' # error using NCHW? (see #190)
x = tf.reshape(x, [-1, 1, 1, shape[1]])
data_format = 'channels_first'
ctx = get_current_tower_context()
coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
......
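A hedged usage sketch of the renamed `BatchNorm` arguments documented above, for the case of loading a pre-trained BN whose statistics should stay frozen, i.e. `training=False` inside a training tower (assuming the tensorpack wrapper; `l` is a placeholder tensor):

    from tensorpack import argscope, BatchNorm
    # normalize with the loaded moving mean/variance;
    # per the docstring above, the EMA is not updated in this case
    with argscope(BatchNorm, training=False):
        l = BatchNorm('bn_frozen', l)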