Commit 4744853b authored by Yuxin Wu

Migrate some examples to use tf.layers argument name convention (#627)

parent 7abf4ace
@@ -8,8 +8,8 @@ Using the tensorpack implementations, you can also benefit from `argscope` and `
 simplify the code.
 Note that these layers were written because there were no other alternatives back at that time.
-In the future we may shift the implementation to `tf.layers` because they will be better maintained.
-You can start using `tf.layers` today as long as it fits your need.
+Now, these layers actually call `tf.layers` directly.
+You can just use `tf.layers` as long as it fits your need.
 
 ### argscope and LinearWrap
 `argscope` gives you a context with default arguments.
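Editor's note: the renames applied throughout this commit follow the `tf.layers` naming convention: `nl` becomes `activation`, `kernel_shape`/`stride` become `kernel_size`/`strides`, `out_channel`/`out_dim` become `filters`/`units` (usually passed positionally), `W_init`/`b_init` become `kernel_initializer`/`bias_initializer`, `BatchNorm`'s `decay` becomes `momentum`, and `data_format` takes `'channels_first'`/`'channels_last'`. A minimal sketch of the new style (not part of the commit; the `tower` function below is only illustrative), assuming the tensorpack wrappers that now forward to `tf.layers`:

```python
import tensorflow as tf
from tensorpack import Conv2D, FullyConnected, MaxPooling, argscope

def tower(image):
    # old style: argscope(Conv2D, nl=tf.nn.relu, kernel_shape=3)
    with argscope(Conv2D, activation=tf.nn.relu, kernel_size=3):
        l = Conv2D('conv0', image, filters=32)            # old: out_channel=32
        l = MaxPooling('pool0', l, 2)
        l = Conv2D('conv1', l, 64, strides=2)             # old: stride=2
        l = FullyConnected('fc0', l, 512)                 # old: out_dim=512, nl=tf.identity
        logits = FullyConnected('fc1', l, 10, activation=tf.identity)
    return logits
```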
@@ -79,19 +79,19 @@ class Model(ModelDesc):
     def _get_NN_prediction(self, image):
         image = tf.cast(image, tf.float32) / 255.0
-        with argscope(Conv2D, nl=tf.nn.relu):
-            l = Conv2D('conv0', image, out_channel=32, kernel_shape=5)
+        with argscope(Conv2D, activation=tf.nn.relu):
+            l = Conv2D('conv0', image, 32, 5)
             l = MaxPooling('pool0', l, 2)
-            l = Conv2D('conv1', l, out_channel=32, kernel_shape=5)
+            l = Conv2D('conv1', l, 32, 5)
             l = MaxPooling('pool1', l, 2)
-            l = Conv2D('conv2', l, out_channel=64, kernel_shape=4)
+            l = Conv2D('conv2', l, 64, 4)
             l = MaxPooling('pool2', l, 2)
-            l = Conv2D('conv3', l, out_channel=64, kernel_shape=3)
+            l = Conv2D('conv3', l, 64, 3)
 
-        l = FullyConnected('fc0', l, 512, nl=tf.identity)
+        l = FullyConnected('fc0', l, 512)
         l = PReLU('prelu', l)
-        logits = FullyConnected('fc-pi', l, out_dim=NUM_ACTIONS, nl=tf.identity)    # unnormalized policy
-        value = FullyConnected('fc-v', l, 1, nl=tf.identity)
+        logits = FullyConnected('fc-pi', l, NUM_ACTIONS)    # unnormalized policy
+        value = FullyConnected('fc-v', l, 1)
         return logits, value
 
     def _build_graph(self, inputs):
@@ -47,8 +47,8 @@ class Model(ModelDesc):
         # o: b x t x HIDDEN
         output = tf.reshape(outputs, [-1, HIDDEN])  # (Bxt) x rnnsize
-        logits = FullyConnected('fc', output, NR_CLASS, nl=tf.identity,
-                                W_init=tf.truncated_normal_initializer(stddev=0.01))
+        logits = FullyConnected('fc', output, NR_CLASS, activation=tf.identity,
+                                kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
         logits = tf.reshape(logits, (BATCH, -1, NR_CLASS))
         loss = tf.nn.ctc_loss(label, logits, seqlen, time_major=False)
@@ -18,24 +18,24 @@ import tensorflow as tf
 def tower_func(image):
     # img: 227x227x3
-    with argscope([Conv2D, FullyConnected], nl=tf.nn.relu):
-        l = Conv2D('conv1', image, out_channel=96, kernel_shape=11, stride=4, padding='VALID')
+    with argscope([Conv2D, FullyConnected], activation=tf.nn.relu):
+        l = Conv2D('conv1', image, filters=96, kernel_size=11, strides=4, padding='VALID')
         l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm1')
-        l = MaxPooling('pool1', l, 3, stride=2, padding='VALID')
-        l = Conv2D('conv2', l, out_channel=256, kernel_shape=5, split=2)
+        l = MaxPooling('pool1', l, 3, strides=2, padding='VALID')
+        l = Conv2D('conv2', l, filters=256, kernel_size=5, split=2)
         l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm2')
-        l = MaxPooling('pool2', l, 3, stride=2, padding='VALID')
-        l = Conv2D('conv3', l, out_channel=384, kernel_shape=3)
-        l = Conv2D('conv4', l, out_channel=384, kernel_shape=3, split=2)
-        l = Conv2D('conv5', l, out_channel=256, kernel_shape=3, split=2)
-        l = MaxPooling('pool3', l, 3, stride=2, padding='VALID')
+        l = MaxPooling('pool2', l, 3, strides=2, padding='VALID')
+        l = Conv2D('conv3', l, filters=384, kernel_size=3)
+        l = Conv2D('conv4', l, filters=384, kernel_size=3, split=2)
+        l = Conv2D('conv5', l, filters=256, kernel_size=3, split=2)
+        l = MaxPooling('pool3', l, 3, strides=2, padding='VALID')
 
         # This is just a script to load model, so we ignore the dropout layer
         l = FullyConnected('fc6', l, 4096)
-        l = FullyConnected('fc7', l, out_dim=4096)
-        logits = FullyConnected('fc8', l, out_dim=1000, nl=tf.identity)
+        l = FullyConnected('fc7', l, 4096)
+        logits = FullyConnected('fc8', l, 1000)
     tf.nn.softmax(logits, name='prob')
@@ -48,9 +48,8 @@ def CPM(image):
     gmap = tf.constant(get_gaussian_map())
     gmap = tf.pad(gmap, [[0, 0], [0, 1], [0, 1], [0, 0]])
-    pool_center = AvgPooling('mappool', gmap, 9, stride=8, padding='VALID')
-    with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu,
-                  W_init=tf.random_normal_initializer(stddev=0.01)):
+    pool_center = AvgPooling('mappool', gmap, 9, strides=8, padding='VALID')
+    with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
         shared = (LinearWrap(image)
                   .Conv2D('conv1_1', 64)
                   .Conv2D('conv1_2', 64)
@@ -78,22 +77,20 @@ def CPM(image):
             l = tf.concat([l, shared, pool_center], 3,
                           name='concat_stage{}'.format(stage))
             for i in range(1, 6):
-                l = Conv2D('Mconv{}_stage{}'.format(i, stage), l, 128)
-            l = Conv2D('Mconv6_stage{}'.format(stage), l, 128, kernel_shape=1)
-            l = Conv2D('Mconv7_stage{}'.format(stage),
-                       l, 15, kernel_shape=1, nl=tf.identity)
+                l = Conv2D('Mconv{}_stage{}'.format(i, stage), l, 128, 7, activation=tf.nn.relu)
+            l = Conv2D('Mconv6_stage{}'.format(stage), l, 128, 1, activation=tf.nn.relu)
+            l = Conv2D('Mconv7_stage{}'.format(stage), l, 15, 1, activation=tf.identity)
             return l
 
-        with argscope(Conv2D, kernel_shape=7, nl=tf.nn.relu):
-            out1 = (LinearWrap(shared)
-                    .Conv2D('conv5_1_CPM', 512, kernel_shape=1)
-                    .Conv2D('conv5_2_CPM', 15, kernel_shape=1, nl=tf.identity)())
-            out2 = add_stage(2, out1)
-            out3 = add_stage(3, out2)
-            out4 = add_stage(4, out3)
-            out5 = add_stage(5, out4)
-            out6 = add_stage(6, out5)
-            tf.image.resize_bilinear(out6, [368, 368], name='resized_map')
+        out1 = (LinearWrap(shared)
+                .Conv2D('conv5_1_CPM', 512, 1, activation=tf.nn.relu)
+                .Conv2D('conv5_2_CPM', 15, 1, activation=tf.identity)())
+        out2 = add_stage(2, out1)
+        out3 = add_stage(3, out2)
+        out4 = add_stage(4, out3)
+        out5 = add_stage(5, out4)
+        out6 = add_stage(6, out5)
+        tf.image.resize_bilinear(out6, [368, 368], name='resized_map')
 
 def run_test(model_path, img_file):
@@ -17,7 +17,7 @@ from tensorpack.dataflow.dataset import ILSVRCMeta
 def tower_func(image):
-    with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
+    with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
         logits = (LinearWrap(image)
                   .Conv2D('conv1_1', 64)
                   .Conv2D('conv1_2', 64)
@@ -42,11 +42,11 @@ def tower_func(image):
                   .Conv2D('conv5_3', 512)
                   .MaxPooling('pool5', 2)
                   # 7
-                  .FullyConnected('fc6', 4096, nl=tf.nn.relu)
+                  .FullyConnected('fc6', 4096, activation=tf.nn.relu)
                   .Dropout('drop0', 0.5)
-                  .FullyConnected('fc7', 4096, nl=tf.nn.relu)
+                  .FullyConnected('fc7', 4096, activation=tf.nn.relu)
                   .Dropout('drop1', 0.5)
-                  .FullyConnected('fc8', out_dim=1000, nl=tf.identity)())
+                  .FullyConnected('fc8', 1000)())
     tf.nn.softmax(logits, name='prob')
@@ -16,7 +16,7 @@ from tensorpack.dataflow.dataset import ILSVRCMeta
 def tower_func(image):
-    with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
+    with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
         logits = (LinearWrap(image)
                   .Conv2D('conv1_1', 64)
                   .Conv2D('conv1_2', 64)
@@ -44,11 +44,11 @@ def tower_func(image):
                   .Conv2D('conv5_4', 512)
                   .MaxPooling('pool5', 2)
                   # 7
-                  .FullyConnected('fc6', 4096, nl=tf.nn.relu)
+                  .FullyConnected('fc6', 4096, activation=tf.nn.relu)
                   .Dropout('drop0', 0.5)
-                  .FullyConnected('fc7', 4096, nl=tf.nn.relu)
+                  .FullyConnected('fc7', 4096, activation=tf.nn.relu)
                   .Dropout('drop1', 0.5)
-                  .FullyConnected('fc8', out_dim=1000, nl=tf.identity)())
+                  .FullyConnected('fc8', 1000)())
     tf.nn.softmax(logits, name='prob')
@@ -99,7 +99,7 @@ class Model(ModelDesc):
         # seqlen x (Bxrnnsize)
         output = tf.reshape(tf.concat(outputs, 1), [-1, param.rnn_size])  # (Bxseqlen) x rnnsize
-        logits = FullyConnected('fc', output, param.vocab_size, nl=tf.identity)
+        logits = FullyConnected('fc', output, param.vocab_size, activation=tf.identity)
         tf.nn.softmax(logits / param.softmax_temprature, name='prob')
         xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
@@ -57,9 +57,9 @@ class Model(DQNModel):
         with argscope(Conv2D, activation=lambda x: PReLU('prelu', x), use_bias=True):
             l = (LinearWrap(image)
                  # Nature architecture
-                 .Conv2D('conv0', out_channel=32, kernel_shape=8, stride=4)
-                 .Conv2D('conv1', out_channel=64, kernel_shape=4, stride=2)
-                 .Conv2D('conv2', out_channel=64, kernel_shape=3)
+                 .Conv2D('conv0', 32, 8, strides=4)
+                 .Conv2D('conv1', 64, 4, strides=2)
+                 .Conv2D('conv2', 64, 3)
                  # architecture used for the figure in the README, slower but takes fewer iterations to converge
                  # .Conv2D('conv0', out_channel=32, kernel_shape=5)
@@ -73,11 +73,11 @@ class Model(DQNModel):
                  .FullyConnected('fc0', 512)
                  .tf.nn.leaky_relu(alpha=0.01)())
         if self.method != 'Dueling':
-            Q = FullyConnected('fct', l, self.num_actions, nl=tf.identity)
+            Q = FullyConnected('fct', l, self.num_actions)
         else:
             # Dueling DQN
-            V = FullyConnected('fctV', l, 1, nl=tf.identity)
-            As = FullyConnected('fctA', l, self.num_actions, nl=tf.identity)
+            V = FullyConnected('fctV', l, 1)
+            As = FullyConnected('fctA', l, self.num_actions)
             Q = tf.add(As, V - tf.reduce_mean(As, 1, keep_dims=True))
         return tf.identity(Q, name='Qvalue')
@@ -33,14 +33,13 @@ class Model(mnist_example.Model):
         image, label = inputs
         image = tf.expand_dims(image, 3)
 
-        with argscope(Conv2D, kernel_shape=5, nl=tf.nn.relu):
-            logits = (LinearWrap(image)  # the starting brace is only for line-breaking
-                      .Conv2D('conv0', out_channel=32, padding='VALID')
-                      .MaxPooling('pool0', 2)
-                      .Conv2D('conv1', out_channel=64, padding='VALID')
-                      .MaxPooling('pool1', 2)
-                      .FullyConnected('fc0', 512, nl=tf.nn.relu)
-                      .FullyConnected('fc1', out_dim=10, nl=tf.identity)())
+        logits = (LinearWrap(image)  # the starting brace is only for line-breaking
+                  .Conv2D('conv0', 32, 5, padding='VALID', activation=tf.nn.relu)
+                  .MaxPooling('pool0', 2)
+                  .Conv2D('conv1', 64, 5, padding='VALID', activation=tf.nn.relu)
+                  .MaxPooling('pool1', 2)
+                  .FullyConnected('fc0', 512, activation=tf.nn.relu)
+                  .FullyConnected('fc1', out_dim=10, activation=tf.identity)())
         tf.nn.softmax(logits, name='prob')
 
         wrong = symbolic_functions.prediction_incorrect(logits, label)
@@ -106,10 +106,10 @@ class Model(ModelDesc):
             return fa(nonlin(x))
 
         with remap_variables(new_get_variable), \
-                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
-                argscope([Conv2D, FullyConnected], use_bias=False, nl=tf.identity):
+                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
+                argscope(Conv2D, use_bias=False):
             logits = (LinearWrap(image)
-                      .Conv2D('conv0', 96, 12, stride=4, padding='VALID')
+                      .Conv2D('conv0', 96, 12, strides=4, padding='VALID')
                       .apply(activate)
                       .Conv2D('conv1', 256, 5, padding='SAME', split=2)
                       .apply(fg)
@@ -139,7 +139,7 @@ class Model(ModelDesc):
                       .BatchNorm('bnfc0')
                       .apply(activate)
-                      .FullyConnected('fc1', 4096)
+                      .FullyConnected('fc1', 4096, use_bias=False)
                       .apply(fg)
                       .BatchNorm('bnfc1')
                       .apply(nonlin)
@@ -72,8 +72,8 @@ class Model(ModelDesc):
         image = image / 256.0
 
         with remap_variables(binarize_weight), \
-                argscope(BatchNorm, decay=0.9, epsilon=1e-4), \
-                argscope(Conv2D, use_bias=False, nl=tf.identity):
+                argscope(BatchNorm, momentum=0.9, epsilon=1e-4), \
+                argscope(Conv2D, use_bias=False):
             logits = (LinearWrap(image)
                       .Conv2D('conv0', 48, 5, padding='VALID', use_bias=True)
                       .MaxPooling('pool0', 2, padding='SAME')
@@ -106,7 +106,7 @@ class Model(ModelDesc):
                       .Conv2D('conv6', 512, 5, padding='VALID')
                       .apply(fg).BatchNorm('bn6')
                       .apply(cabs)
-                      .FullyConnected('fc1', 10, nl=tf.identity)())
+                      .FullyConnected('fc1', 10)())
         tf.nn.softmax(logits, name='output')
 
         # compute the number of failed samples
@@ -55,16 +55,15 @@ class Model(ModelDesc):
         def branch(name, l, up):
             with tf.variable_scope(name):
-                l = Conv2D('convfc', l, 1, kernel_shape=1, nl=tf.identity,
+                l = Conv2D('convfc', l, 1, kernel_size=1, activation=tf.identity,
                            use_bias=True,
-                           W_init=tf.constant_initializer(),
-                           b_init=tf.constant_initializer())
+                           kernel_initializer=tf.constant_initializer())
                 while up != 1:
                     l = BilinearUpSample('upsample{}'.format(up), l, 2)
                     up = up / 2
                 return l
 
-        with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu):
+        with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu):
             l = Conv2D('conv1_1', image, 64)
             l = Conv2D('conv1_2', l, 64)
             b1 = branch('branch1', l, 1)
@@ -93,9 +92,9 @@ class Model(ModelDesc):
             b5 = branch('branch5', l, 16)
 
         final_map = Conv2D('convfcweight',
-                           tf.concat([b1, b2, b3, b4, b5], 3), 1, 1,
-                           W_init=tf.constant_initializer(0.2),
-                           use_bias=False, nl=tf.identity)
+                           tf.concat([b1, b2, b3, b4, b5], 3), 1, kernel_size=1,
+                           kernel_initializer=tf.constant_initializer(0.2),
+                           use_bias=False, activation=tf.identity)
         costs = []
         for idx, b in enumerate([b1, b2, b3, b4, b5, final_map]):
             output = tf.nn.sigmoid(b, name='output{}'.format(idx + 1))
@@ -39,11 +39,11 @@ class Model(ModelDesc):
             if nr1x1 != 0:
                 outs.append(Conv2D('conv1x1', x, nr1x1, 1))
             x2 = Conv2D('conv3x3r', x, nr3x3r, 1)
-            outs.append(Conv2D('conv3x3', x2, nr3x3, 3, stride=stride))
+            outs.append(Conv2D('conv3x3', x2, nr3x3, 3, strides=stride))
             x3 = Conv2D('conv233r', x, nr233r, 1)
             x3 = Conv2D('conv233a', x3, nr233, 3)
-            outs.append(Conv2D('conv233b', x3, nr233, 3, stride=stride))
+            outs.append(Conv2D('conv233b', x3, nr233, 3, strides=stride))
             if pooltype == 'max':
                 x4 = MaxPooling('mpool', x, 3, stride, padding='SAME')
@@ -55,9 +55,9 @@ class Model(ModelDesc):
                 outs.append(x4)
             return tf.concat(outs, 3, name='concat')
 
-        with argscope(Conv2D, nl=BNReLU, use_bias=False):
+        with argscope(Conv2D, activation=BNReLU, use_bias=False):
             l = (LinearWrap(image)
-                 .Conv2D('conv0', 64, 7, stride=2)
+                 .Conv2D('conv0', 64, 7, strides=2)
                  .MaxPooling('pool0', 3, 2, padding='SAME')
                  .Conv2D('conv1', 64, 1)
                  .Conv2D('conv2', 192, 3)
@@ -69,8 +69,8 @@ class Model(ModelDesc):
             br1 = (LinearWrap(l)
                    .Conv2D('loss1conv', 128, 1)
-                   .FullyConnected('loss1fc', 1024, nl=tf.nn.relu)
-                   .FullyConnected('loss1logit', 1000, nl=tf.identity)())
+                   .FullyConnected('loss1fc', 1024, activation=tf.nn.relu)
+                   .FullyConnected('loss1logit', 1000, activation=tf.identity)())
             loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=br1, labels=label)
             loss1 = tf.reduce_mean(loss1, name='loss1')
@@ -82,8 +82,8 @@ class Model(ModelDesc):
             l = inception('incep4e', l, 0, 128, 192, 192, 256, 0, 'max')
             br2 = Conv2D('loss2conv', l, 128, 1)
-            br2 = FullyConnected('loss2fc', br2, 1024, nl=tf.nn.relu)
-            br2 = FullyConnected('loss2logit', br2, 1000, nl=tf.identity)
+            br2 = FullyConnected('loss2fc', br2, 1024, activation=tf.nn.relu)
+            br2 = FullyConnected('loss2logit', br2, 1000, activation=tf.identity)
             loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=br2, labels=label)
             loss2 = tf.reduce_mean(loss2, name='loss2')
@@ -92,7 +92,7 @@ class Model(ModelDesc):
             l = inception('incep5b', l, 352, 192, 320, 192, 224, 128, 'max')
             l = GlobalAvgPooling('gap', l)
-            logits = FullyConnected('linear', l, out_dim=1000, nl=tf.identity)
+            logits = FullyConnected('linear', l, 1000, activation=tf.identity)
             tf.nn.softmax(logits, name='output')
             loss3 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
             loss3 = tf.reduce_mean(loss3, name='loss3')
@@ -28,7 +28,7 @@ TOTAL_BATCH_SIZE = 1024
 @layer_register(log_shape=True)
 def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
-              W_init=None, nl=tf.identity):
+              W_init=None, activation=tf.identity):
     in_shape = x.get_shape().as_list()
     in_channel = in_shape[1]
     assert out_channel % in_channel == 0
@@ -41,7 +41,7 @@ def DepthConv(x, out_channel, kernel_shape, padding='SAME', stride=1,
     W = tf.get_variable('W', filter_shape, initializer=W_init)
     conv = tf.nn.depthwise_conv2d(x, W, [1, 1, stride, stride], padding=padding, data_format='NCHW')
-    return nl(conv, name='output')
+    return activation(conv, name='output')
 
 @under_name_scope()
@@ -71,13 +71,13 @@ class Model(ImageNetModel):
             # We do not apply group convolution on the first pointwise layer
            # because the number of input channels is relatively small.
             first_split = group if in_channel != 12 else 1
-            l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, nl=BNReLU)
+            l = Conv2D('conv1', l, out_channel // 4, 1, split=first_split, activation=BNReLU)
             l = channel_shuffle(l, group)
-            l = DepthConv('dconv', l, out_channel // 4, 3, nl=BN, stride=stride)
+            l = DepthConv('dconv', l, out_channel // 4, 3, activation=BN, stride=stride)
             l = Conv2D('conv2', l,
                        out_channel if stride == 1 else out_channel - in_channel,
-                       1, split=group, nl=BN)
+                       1, split=group, activation=BN)
             if stride == 1:     # unit (b)
                 output = tf.nn.relu(shortcut + l)
             else:   # unit (c)
@@ -90,7 +90,7 @@ class Model(ImageNetModel):
         group = 3
         channels = [120, 240, 480]
-        l = Conv2D('conv1', image, 12, 3, stride=2, nl=BNReLU)
+        l = Conv2D('conv1', image, 12, 3, strides=2, activation=BNReLU)
         l = MaxPooling('pool1', l, 3, 2, padding='SAME')
         with tf.variable_scope('group1'):
@@ -28,9 +28,9 @@ class Model(ImageNetModel):
     weight_decay = 5e-4
 
     def get_logits(self, image):
-        with argscope(Conv2D, kernel_shape=3,
-                      W_init=tf.variance_scaling_initializer(scale=2.)), \
-                argscope([Conv2D, MaxPooling, BatchNorm], data_format='NCHW'):
+        with argscope(Conv2D, kernel_size=3,
+                      kernel_initializer=tf.variance_scaling_initializer(scale=2.)), \
+                argscope([Conv2D, MaxPooling, BatchNorm], data_format='channels_first'):
             logits = (LinearWrap(image)
                       .apply(convnormrelu, 'conv1_1', 64)
                       .apply(convnormrelu, 'conv1_2', 64)
@@ -56,15 +56,15 @@ class Model(ImageNetModel):
                      .MaxPooling('pool5', 2)
                      # 7
                      .FullyConnected('fc6', 4096,
-                                     W_init=tf.random_normal_initializer(stddev=0.001))
+                                     kernel_initializer=tf.random_normal_initializer(stddev=0.001))
                      .tf.nn.relu(name='fc6_relu')
                      .Dropout('drop0', rate=0.5)
                      .FullyConnected('fc7', 4096,
-                                     W_init=tf.random_normal_initializer(stddev=0.001))
+                                     kernel_initializer=tf.random_normal_initializer(stddev=0.001))
                      .tf.nn.relu(name='fc7_relu')
                      .Dropout('drop1', rate=0.5)
                      .FullyConnected('fc8', 1000,
-                                     W_init=tf.random_normal_initializer(stddev=0.01))())
+                                     kernel_initializer=tf.random_normal_initializer(stddev=0.01))())
         add_param_summary(('.*', ['histogram', 'rms']))
         return logits
@@ -40,26 +40,26 @@ class Model(ModelDesc):
             tf.summary.image("train_image", image, 10)
         if tf.test.is_gpu_available():
             image = tf.transpose(image, [0, 3, 1, 2])
-            data_format = 'NCHW'
+            data_format = 'channels_first'
         else:
-            data_format = 'NHWC'
+            data_format = 'channels_last'
         image = image / 4.0     # just to make range smaller
 
-        with argscope(Conv2D, nl=BNReLU, use_bias=False, kernel_shape=3), \
+        with argscope(Conv2D, activation=BNReLU, use_bias=False, kernel_size=3), \
                 argscope([Conv2D, MaxPooling, BatchNorm], data_format=data_format):
             logits = LinearWrap(image) \
-                .Conv2D('conv1.1', out_channel=64) \
-                .Conv2D('conv1.2', out_channel=64) \
+                .Conv2D('conv1.1', filters=64) \
+                .Conv2D('conv1.2', filters=64) \
                 .MaxPooling('pool1', 3, stride=2, padding='SAME') \
-                .Conv2D('conv2.1', out_channel=128) \
-                .Conv2D('conv2.2', out_channel=128) \
+                .Conv2D('conv2.1', filters=128) \
+                .Conv2D('conv2.2', filters=128) \
                 .MaxPooling('pool2', 3, stride=2, padding='SAME') \
-                .Conv2D('conv3.1', out_channel=128, padding='VALID') \
-                .Conv2D('conv3.2', out_channel=128, padding='VALID') \
-                .FullyConnected('fc0', 1024 + 512, nl=tf.nn.relu) \
+                .Conv2D('conv3.1', filters=128, padding='VALID') \
+                .Conv2D('conv3.2', filters=128, padding='VALID') \
+                .FullyConnected('fc0', 1024 + 512, activation=tf.nn.relu) \
                 .tf.nn.dropout(keep_prob) \
-                .FullyConnected('fc1', 512, nl=tf.nn.relu) \
-                .FullyConnected('linear', out_dim=self.cifar_classnum, nl=tf.identity)()
+                .FullyConnected('fc1', 512, activation=tf.nn.relu) \
+                .FullyConnected('linear', out_dim=self.cifar_classnum)()
 
         cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
         cost = tf.reduce_mean(cost, name='cross_entropy_loss')
@@ -42,7 +42,7 @@ class Model(ModelDesc):
         image = image * 2 - 1   # center the pixels values at zero
         # The context manager `argscope` sets the default option for all the layers under
         # this context. Here we use 32 channel convolution with shape 3x3
-        with argscope(Conv2D, kernel_shape=3, nl=tf.nn.relu, out_channel=32):
+        with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu, filters=32):
            logits = (LinearWrap(image)
                      .Conv2D('conv0')
                      .MaxPooling('pool0', 2)
@@ -31,7 +31,7 @@ class Model(ModelDesc):
         image = image / 128.0 - 1
 
-        with argscope(Conv2D, nl=BNReLU, use_bias=False):
+        with argscope(Conv2D, activation=BNReLU, use_bias=False):
             logits = (LinearWrap(image)
                       .Conv2D('conv1', 24, 5, padding='VALID')
                       .MaxPooling('pool1', 2, padding='SAME')
@@ -39,10 +39,11 @@ class Model(ModelDesc):
                      .Conv2D('conv3', 32, 3, padding='VALID')
                      .MaxPooling('pool2', 2, padding='SAME')
                      .Conv2D('conv4', 64, 3, padding='VALID')
-                     .Dropout('drop', 0.5)
+                     .Dropout('drop', rate=0.5)
                      .FullyConnected('fc0', 512,
-                                     b_init=tf.constant_initializer(0.1), nl=tf.nn.relu)
-                     .FullyConnected('linear', out_dim=10, nl=tf.identity)())
+                                     bias_initializer=tf.constant_initializer(0.1),
+                                     activation=tf.nn.relu)
+                     .FullyConnected('linear', units=10)())
         tf.nn.softmax(logits, name='output')
 
         accuracy = tf.to_float(tf.nn.in_top_k(logits, label, 1))
@@ -73,32 +73,27 @@ def reshape_for_bn(param, ndims, chan, data_format):
         'use_bias': 'center',
         'use_scale': 'scale',
         'gamma_init': 'gamma_initializer',
-        'decay': 'momentum'
+        'decay': 'momentum',
+        'use_local_stat': 'training'
     })
-def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
-              scale=True, center=True,
+def BatchNorm(inputs, training=None, momentum=0.9, epsilon=1e-5,
+              center=True, scale=True,
               gamma_initializer=tf.ones_initializer(),
               data_format='channels_last',
               internal_update=False):
     """
-    Batch Normalization layer, as described in the paper:
-    `Batch Normalization: Accelerating Deep Network Training by
-    Reducing Internal Covariance Shift <http://arxiv.org/abs/1502.03167>`_.
+    Mostly equivalent to `tf.layers.batch_normalization`, but difference in
+    the following:
+    1. Accepts `data_format` rather than `axis`. For 2D input, this argument will be ignored.
+    2. Default value for `momentum` and `epsilon` is different.
+    3. Default value for `training` is automatically obtained from `TowerContext`.
+    4. Support the `internal_update` option.
 
     Args:
-        x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should match data_format.
-        use_local_stat (bool): whether to use mean/var of the current batch or the moving average.
-            Defaults to True in training and False in inference.
-        momentum (float): momentum of moving average.
-        epsilon (float): epsilon to avoid divide-by-zero.
-        scale, center (bool): whether to use the extra affine transformation or not.
-        gamma_initializer: initializer for gamma (the scale).
         internal_update (bool): if False, add EMA update ops to
             `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
-            which will be slightly slower.
-
-    Returns:
-        tf.Tensor: a tensor named ``output`` with the same shape of x.
+            by control dependencies.
 
     Variable Names:
@@ -110,18 +105,18 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
     Note:
         1. About multi-GPU training: moving averages across GPUs are not aggregated.
           Batch statistics are computed independently.  This is consistent with most frameworks.
-        2. Combinations of ``use_local_stat`` and ``ctx.is_training``:
-            * ``use_local_stat == is_training``: standard BN, EMA are
-              maintained during training and used during inference.
-            * ``use_local_stat and not is_training``: still use local (batch)
-              statistics in inference.
-            * ``not use_local_stat and is_training``: use EMA to normalize in
+        2. Combinations of ``training`` and ``ctx.is_training``:
+            * ``training == ctx.is_training``: standard BN, EMA are
+              maintained during training and used during inference. This is
+              the default.
+            * ``training and not ctx.is_training``: still use batch statistics in inference.
+            * ``not training and ctx.is_training``: use EMA to normalize in
              training. This is useful when you load a pre-trained BN and
              don't want to fine tune the EMA. EMA will not be updated in
              this case.
     """
     data_format = get_data_format(data_format, tfmode=False)
-    shape = x.get_shape().as_list()
+    shape = inputs.get_shape().as_list()
     ndims = len(shape)
     assert ndims in [2, 4]
     if ndims == 2:
@@ -134,17 +129,18 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
     beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, scale, center, gamma_initializer)
 
     ctx = get_current_tower_context()
+    use_local_stat = training
     if use_local_stat is None:
         use_local_stat = ctx.is_training
     use_local_stat = bool(use_local_stat)
 
     if use_local_stat:
         if ndims == 2:
-            x = tf.reshape(x, [-1, 1, 1, n_out])    # fused_bn only takes 4D input
+            inputs = tf.reshape(inputs, [-1, 1, 1, n_out])    # fused_bn only takes 4D input
             # fused_bn has error using NCHW? (see #190)
         xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
-            x, gamma, beta, epsilon=epsilon,
+            inputs, gamma, beta, epsilon=epsilon,
             is_training=True, data_format=data_format)
         if ndims == 2:
@@ -159,19 +155,19 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
             # Using moving_mean/moving_variance in training, which means we
             # loaded a pre-trained BN and only fine-tuning the affine part.
             xn, _, _ = tf.nn.fused_batch_norm(
-                x, gamma, beta,
+                inputs, gamma, beta,
                 mean=moving_mean, variance=moving_var, epsilon=epsilon,
                 data_format=data_format, is_training=False)
        else:
            if ndims == 4:
                xn, _, _ = tf.nn.fused_batch_norm(
-                    x, gamma, beta,
+                    inputs, gamma, beta,
                    mean=moving_mean, variance=moving_var, epsilon=epsilon,
                    data_format=data_format, is_training=False)
            else:
                # avoid the reshape if possible (when channel is the last dimension)
                xn = tf.nn.batch_normalization(
-                    x, moving_mean, moving_var, beta, gamma, epsilon)
+                    inputs, moving_mean, moving_var, beta, gamma, epsilon)
 
     # maintain EMA only on one GPU is OK, even in replicated mode.
     # because training time doesn't use EMA
@@ -201,7 +197,7 @@ def BatchNorm(x, use_local_stat=None, momentum=0.9, epsilon=1e-5,
         'decay': 'momentum'
     })
 def BatchRenorm(x, rmax, dmax, momentum=0.9, epsilon=1e-5,
-                scale=True, center=True, gamma_initializer=None,
+                center=True, scale=True, gamma_initializer=None,
                 data_format='channels_last'):
     """
     Batch Renormalization layer, as described in the paper:
@@ -231,8 +227,7 @@ def BatchRenorm(x, rmax, dmax, momentum=0.9, epsilon=1e-5,
     ndims = len(shape)
     assert ndims in [2, 4]
     if ndims == 2:
-        data_format = 'channels_last'    # error using NCHW? (see #190)
-        x = tf.reshape(x, [-1, 1, 1, shape[1]])
+        data_format = 'channels_first'
 
     ctx = get_current_tower_context()
     coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
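Editor's note: since the `BatchNorm` signature above now mirrors `tf.layers.batch_normalization`, call sites migrate by renaming `decay` to `momentum` and `use_local_stat` to `training` (the old names are still remapped by the name-mapping decorator shown in this diff). A minimal sketch of the updated usage (not part of the commit; `net` is only illustrative), assuming the tensorpack `BatchNorm`/`Conv2D` wrappers under a `TowerContext` that supplies the training flag when `training` is left as `None`:

```python
import tensorflow as tf
from tensorpack import BatchNorm, Conv2D, argscope

def net(image):
    # old: argscope(BatchNorm, decay=0.9, epsilon=1e-4)
    with argscope(BatchNorm, momentum=0.9, epsilon=1e-4):
        l = Conv2D('conv0', image, 32, 3, activation=tf.nn.relu)
        l = BatchNorm('bn0', l)                   # training defaults to ctx.is_training
        l = BatchNorm('bn1', l, training=False)   # old: use_local_stat=False; always normalize with the EMA
    return l
```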