Fix deprecation warnings about casting and initializers in TF 1.13

c7fd1d9f · Yuxin Wu · be39dbdf
Commit c7fd1d9f authored Dec 19, 2018 by Yuxin Wu
23 changed files
--- a/examples/CTC-TIMIT/train-timit.py
+++ b/examples/CTC-TIMIT/train-timit.py
@@ -59,11 +59,11 @@ class Model(ModelDesc):
        isTrain = get_current_tower_context().is_training
        if isTrain:
            # beam search is too slow to run in training
-            predictions = tf.to_int32(
+            predictions = tf.cast(
-                tf.nn.ctc_greedy_decoder(logits, seqlen)[0][0])
+                tf.nn.ctc_greedy_decoder(logits, seqlen)[0][0], tf.int32)
        else:
-            predictions = tf.to_int32(
+            predictions = tf.cast(
-                tf.nn.ctc_beam_search_decoder(logits, seqlen)[0][0])
+                tf.nn.ctc_beam_search_decoder(logits, seqlen)[0][0], tf.int32)
        err = tf.edit_distance(predictions, label, normalize=True)
        err.set_shape([None])
        err = tf.reduce_mean(err, name='error')

--- a/examples/FasterRCNN/basemodel.py
+++ b/examples/FasterRCNN/basemodel.py
@@ -192,7 +192,7 @@ def resnet_fpn_backbone(image, num_blocks):
    freeze_at = cfg.BACKBONE.FREEZE_AT
    shape2d = tf.shape(image)[2:]
    mult = float(cfg.FPN.RESOLUTION_REQUIREMENT)
-    new_shape2d = tf.to_int32(tf.ceil(tf.to_float(shape2d) / mult) * mult)
+    new_shape2d = tf.cast(tf.ceil(tf.cast(shape2d, tf.float32) / mult) * mult, tf.int32)
    pad_shape2d = new_shape2d - shape2d
    assert len(num_blocks) == 4, num_blocks
    with backbone_scope(freeze=freeze_at > 0):

--- a/examples/FasterRCNN/model_box.py
+++ b/examples/FasterRCNN/model_box.py
@@ -19,7 +19,7 @@ def clip_boxes(boxes, window, name=None):
    """
    boxes = tf.maximum(boxes, 0.0)
    m = tf.tile(tf.reverse(window, [0]), [2])    # (4,)
-    boxes = tf.minimum(boxes, tf.to_float(m), name=name)
+    boxes = tf.minimum(boxes, tf.cast(m, tf.float32), name=name)
    return boxes
@@ -122,14 +122,15 @@ def crop_and_resize(image, boxes, box_ind, crop_size, pad_border=True):
        """
        x0, y0, x1, y1 = tf.split(boxes, 4, axis=1)
-        spacing_w = (x1 - x0) / tf.to_float(crop_shape[1])
+        spacing_w = (x1 - x0) / tf.cast(crop_shape[1], tf.float32)
-        spacing_h = (y1 - y0) / tf.to_float(crop_shape[0])
+        spacing_h = (y1 - y0) / tf.cast(crop_shape[0], tf.float32)
-        nx0 = (x0 + spacing_w / 2 - 0.5) / tf.to_float(image_shape[1] - 1)
+        imshape = [tf.cast(image_shape[0] - 1, tf.float32), tf.cast(image_shape[1] - 1, tf.float32)]
-        ny0 = (y0 + spacing_h / 2 - 0.5) / tf.to_float(image_shape[0] - 1)
+        nx0 = (x0 + spacing_w / 2 - 0.5) / imshape[1]
+        ny0 = (y0 + spacing_h / 2 - 0.5) / imshape[0]
-        nw = spacing_w * tf.to_float(crop_shape[1] - 1) / tf.to_float(image_shape[1] - 1)
+        nw = spacing_w * tf.cast(crop_shape[1] - 1, tf.float32) / imshape[1]
-        nh = spacing_h * tf.to_float(crop_shape[0] - 1) / tf.to_float(image_shape[0] - 1)
+        nh = spacing_h * tf.cast(crop_shape[0] - 1, tf.float32) / imshape[0]
        return tf.concat([ny0, nx0, ny0 + nh, nx0 + nw], axis=1)
@@ -146,7 +147,7 @@ def crop_and_resize(image, boxes, box_ind, crop_size, pad_border=True):
    boxes = transform_fpcoor_for_tf(boxes, image_shape, [crop_size, crop_size])
    image = tf.transpose(image, [0, 2, 3, 1])   # nhwc
    ret = tf.image.crop_and_resize(
-        image, boxes, tf.to_int32(box_ind),
+        image, boxes, tf.cast(box_ind, tf.int32),
        crop_size=[crop_size, crop_size])
    ret = tf.transpose(ret, [0, 3, 1, 2])   # ncss
    return ret

--- a/examples/FasterRCNN/model_cascade.py
+++ b/examples/FasterRCNN/model_cascade.py
@@ -88,7 +88,7 @@ class CascadeRCNNHead(object):
                labels_per_box = tf.gather(self.gt_labels, best_iou_ind)
                fg_mask = max_iou_per_box >= iou_threshold
                fg_inds_wrt_gt = tf.boolean_mask(best_iou_ind, fg_mask)
-                labels_per_box = tf.stop_gradient(labels_per_box * tf.to_int64(fg_mask))
+                labels_per_box = tf.stop_gradient(labels_per_box * tf.cast(fg_mask, tf.int64))
                return BoxProposals(boxes, labels_per_box, fg_inds_wrt_gt)
        else:
            return BoxProposals(boxes)

--- a/examples/FasterRCNN/model_fpn.py
+++ b/examples/FasterRCNN/model_fpn.py
@@ -82,8 +82,8 @@ def fpn_map_rois_to_levels(boxes):
    Be careful that the returned tensor could be empty.
    """
    sqrtarea = tf.sqrt(tf_area(boxes))
-    level = tf.to_int32(tf.floor(
+    level = tf.cast(tf.floor(
-        4 + tf.log(sqrtarea * (1. / 224) + 1e-6) * (1.0 / np.log(2))))
+        4 + tf.log(sqrtarea * (1. / 224) + 1e-6) * (1.0 / np.log(2))), tf.int32)
    # RoI levels range from 2~5 (not 6)
    level_ids = [

--- a/examples/FasterRCNN/model_frcnn.py
+++ b/examples/FasterRCNN/model_frcnn.py
@@ -154,22 +154,22 @@ def fastrcnn_losses(labels, label_logits, fg_boxes, fg_box_logits):
    with tf.name_scope('label_metrics'), tf.device('/cpu:0'):
        prediction = tf.argmax(label_logits, axis=1, name='label_prediction')
-        correct = tf.to_float(tf.equal(prediction, labels))  # boolean/integer gather is unavailable on GPU
+        correct = tf.cast(tf.equal(prediction, labels), tf.float32)  # boolean/integer gather is unavailable on GPU
        accuracy = tf.reduce_mean(correct, name='accuracy')
        fg_label_pred = tf.argmax(tf.gather(label_logits, fg_inds), axis=1)
-        num_zero = tf.reduce_sum(tf.to_int64(tf.equal(fg_label_pred, 0)), name='num_zero')
+        num_zero = tf.reduce_sum(tf.cast(tf.equal(fg_label_pred, 0), tf.int64), name='num_zero')
        false_negative = tf.where(
-            empty_fg, 0., tf.to_float(tf.truediv(num_zero, num_fg)), name='false_negative')
+            empty_fg, 0., tf.cast(tf.truediv(num_zero, num_fg), tf.float32), name='false_negative')
        fg_accuracy = tf.where(
            empty_fg, 0., tf.reduce_mean(tf.gather(correct, fg_inds)), name='fg_accuracy')
    box_loss = tf.losses.huber_loss(
        fg_boxes, fg_box_logits, reduction=tf.losses.Reduction.SUM)
    box_loss = tf.truediv(
-        box_loss, tf.to_float(tf.shape(labels)[0]), name='box_loss')
+        box_loss, tf.cast(tf.shape(labels)[0], tf.float32), name='box_loss')
    add_moving_summary(label_loss, box_loss, accuracy,
-                       fg_accuracy, false_negative, tf.to_float(num_fg, name='num_fg_label'))
+                       fg_accuracy, false_negative, tf.cast(num_fg, tf.float32, name='num_fg_label'))
    return [label_loss, box_loss]
@@ -285,7 +285,8 @@ def fastrcnn_Xconv1fc_head(feature, num_convs, norm=None):
    l = feature
    with argscope(Conv2D, data_format='channels_first',
                  kernel_initializer=tf.variance_scaling_initializer(
-                      scale=2.0, mode='fan_out', distribution='normal')):
+                      scale=2.0, mode='fan_out',
+                      distribution='untruncated_normal' if get_tf_version_tuple() >= (1, 12) else 'normal')):
        for k in range(num_convs):
            l = Conv2D('conv{}'.format(k), l, cfg.FPN.FRCNN_CONV_HEAD_DIM, 3, activation=tf.nn.relu)
            if norm is not None:

--- a/examples/FasterRCNN/model_mrcnn.py
+++ b/examples/FasterRCNN/model_mrcnn.py
@@ -2,11 +2,12 @@
 import tensorflow as tf
-from tensorpack.tfutils.argscope import argscope
 from tensorpack.models import (
    Conv2D, layer_register, Conv2DTranspose)
 from tensorpack.tfutils.scope_utils import under_name_scope
+from tensorpack.tfutils.argscope import argscope
 from tensorpack.tfutils.summary import add_moving_summary
+from tensorpack.tfutils.common import get_tf_version_tuple
 from basemodel import GroupNorm
 from config import config as cfg
@@ -39,13 +40,13 @@ def maskrcnn_loss(mask_logits, fg_labels, fg_target_masks):
    pred_label = mask_probs > 0.5
    truth_label = fg_target_masks > 0.5
    accuracy = tf.reduce_mean(
-        tf.to_float(tf.equal(pred_label, truth_label)),
+        tf.cast(tf.equal(pred_label, truth_label), tf.float32),
        name='accuracy')
    pos_accuracy = tf.logical_and(
        tf.equal(pred_label, truth_label),
        tf.equal(truth_label, True))
-    pos_accuracy = tf.reduce_mean(tf.to_float(pos_accuracy), name='pos_accuracy')
+    pos_accuracy = tf.reduce_mean(tf.cast(pos_accuracy, tf.float32), name='pos_accuracy')
-    fg_pixel_ratio = tf.reduce_mean(tf.to_float(truth_label), name='fg_pixel_ratio')
+    fg_pixel_ratio = tf.reduce_mean(tf.cast(truth_label, tf.float32), name='fg_pixel_ratio')
    add_moving_summary(loss, accuracy, fg_pixel_ratio, pos_accuracy)
    return loss
@@ -67,7 +68,8 @@ def maskrcnn_upXconv_head(feature, num_category, num_convs, norm=None):
    l = feature
    with argscope([Conv2D, Conv2DTranspose], data_format='channels_first',
                  kernel_initializer=tf.variance_scaling_initializer(
-                      scale=2.0, mode='fan_out', distribution='normal')):
+                      scale=2.0, mode='fan_out',
+                      distribution='untruncated_normal' if get_tf_version_tuple() >= (1, 12) else 'normal')):
        # c2's MSRAFill is fan_out
        for k in range(num_convs):
            l = Conv2D('fcn{}'.format(k), l, cfg.MRCNN.HEAD_DIM, 3, activation=tf.nn.relu)

--- a/examples/FasterRCNN/model_rpn.py
+++ b/examples/FasterRCNN/model_rpn.py
@@ -72,9 +72,9 @@ def rpn_losses(anchor_labels, anchor_boxes, label_logits, box_logits):
                        tf.equal(valid_prediction, valid_anchor_labels)),
                    dtype=tf.int32)
                placeholder = 0.5   # A small value will make summaries appear lower.
-                recall = tf.to_float(tf.truediv(pos_prediction_corr, nr_pos))
+                recall = tf.cast(tf.truediv(pos_prediction_corr, nr_pos), tf.float32)
                recall = tf.where(tf.equal(nr_pos, 0), placeholder, recall, name='recall_th{}'.format(th))
-                precision = tf.to_float(tf.truediv(pos_prediction_corr, nr_pos_prediction))
+                precision = tf.cast(tf.truediv(pos_prediction_corr, nr_pos_prediction), tf.float32)
                precision = tf.where(tf.equal(nr_pos_prediction, 0),
                                     placeholder, precision, name='precision_th{}'.format(th))
                summaries.extend([precision, recall])
@@ -84,7 +84,7 @@ def rpn_losses(anchor_labels, anchor_boxes, label_logits, box_logits):
    # But the total RPN loss will be fine.  TODO make the summary op smarter
    placeholder = 0.
    label_loss = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.to_float(valid_anchor_labels), logits=valid_label_logits)
+        labels=tf.cast(valid_anchor_labels, tf.float32), logits=valid_label_logits)
    label_loss = tf.reduce_sum(label_loss) * (1. / cfg.RPN.BATCH_PER_IM)
    label_loss = tf.where(tf.equal(nr_valid, 0), placeholder, label_loss, name='label_loss')

--- a/examples/FasterRCNN/train.py
+++ b/examples/FasterRCNN/train.py
@@ -205,7 +205,7 @@ class ResNetC4Model(DetectionModel):
                feature_maskrcnn = resnet_conv5(roi_resized, cfg.BACKBONE.RESNET_NUM_BLOCKS[-1])
                mask_logits = maskrcnn_upXconv_head(
                    'maskrcnn', feature_maskrcnn, cfg.DATA.NUM_CATEGORY, 0)   # #result x #cat x 14x14
-                indices = tf.stack([tf.range(tf.size(final_labels)), tf.to_int32(final_labels) - 1], axis=1)
+                indices = tf.stack([tf.range(tf.size(final_labels)), tf.cast(final_labels, tf.int32) - 1], axis=1)
                final_mask_logits = tf.gather_nd(mask_logits, indices)   # #resultx14x14
                tf.sigmoid(final_mask_logits, name='output/masks')
            return []
@@ -330,7 +330,7 @@ class ResNetFPNModel(DetectionModel):
                maskrcnn_head_func = getattr(model_mrcnn, cfg.FPN.MRCNN_HEAD_FUNC)
                mask_logits = maskrcnn_head_func(
                    'maskrcnn', roi_feature_maskrcnn, cfg.DATA.NUM_CATEGORY)   # #fg x #cat x 28 x 28
-                indices = tf.stack([tf.range(tf.size(final_labels)), tf.to_int32(final_labels) - 1], axis=1)
+                indices = tf.stack([tf.range(tf.size(final_labels)), tf.cast(final_labels, tf.int32) - 1], axis=1)
                final_mask_logits = tf.gather_nd(mask_logits, indices)   # #resultx28x28
                tf.sigmoid(final_mask_logits, name='output/masks')
            return []

--- a/examples/OpticalFlow/flownet_models.py
+++ b/examples/OpticalFlow/flownet_models.py
@@ -81,7 +81,7 @@ def resample(img, flow):
    img_flat = tf.reshape(tf.transpose(img, [0, 2, 3, 1]), [-1, c])
    dx, dy = tf.unstack(flow, axis=1)
-    xf, yf = tf.meshgrid(tf.to_float(tf.range(w)), tf.to_float(tf.range(h)))
+    xf, yf = tf.meshgrid(tf.cast(tf.range(w), tf.float32), tf.cast(tf.range(h), tf.float32))
    xf = xf + dx
    yf = yf + dy

--- a/examples/ResNet/cifar10-preact18-mixup.py
+++ b/examples/ResNet/cifar10-preact18-mixup.py
@@ -67,8 +67,8 @@ class ResNet_Cifar(ModelDesc):
        ce_cost = tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=logits)
        ce_cost = tf.reduce_mean(ce_cost, name='cross_entropy_loss')
-        single_label = tf.to_int32(tf.argmax(label, axis=1))
+        single_label = tf.cast(tf.argmax(label, axis=1), tf.int32)
-        wrong = tf.to_float(tf.logical_not(tf.nn.in_top_k(logits, single_label, 1)), name='wrong_vector')
+        wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, single_label, 1)), tf.float32, name='wrong_vector')
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'), ce_cost)
        add_param_summary(('.*/W', ['histogram']))

--- a/examples/ResNet/cifar10-resnet.py
+++ b/examples/ResNet/cifar10-resnet.py
@@ -98,7 +98,7 @@ class Model(ModelDesc):
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
-        wrong = tf.to_float(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), name='wrong_vector')
+        wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='wrong_vector')
        # monitor training error
        add_moving_summary(tf.reduce_mean(wrong, name='train_error'))

--- a/examples/SpatialTransformer/mnist-addition.py
+++ b/examples/SpatialTransformer/mnist-addition.py
@@ -35,7 +35,7 @@ def sample(img, coords):
    max_coor = tf.constant([shape[0] - 1, shape[1] - 1], dtype=tf.float32)
    coords = tf.clip_by_value(coords, 0., max_coor)  # borderMode==repeat
-    coords = tf.to_int32(coords)
+    coords = tf.cast(coords, tf.int32)
    batch_index = tf.range(batch, dtype=tf.int32)
    batch_index = tf.reshape(batch_index, [-1, 1, 1, 1])
@@ -164,7 +164,7 @@ class Model(ModelDesc):
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
-        wrong = tf.to_float(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), name='incorrect_vector')
+        wrong = tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, 1)), tf.float32, name='incorrect_vector')
        summary.add_moving_summary(tf.reduce_mean(wrong, name='train_error'))
        wd_cost = tf.multiply(1e-5, regularize_cost('fc.*/W', tf.nn.l2_loss),

--- a/examples/basics/cifar-convnet.py
+++ b/examples/basics/cifar-convnet.py
@@ -63,7 +63,7 @@ class Model(ModelDesc):
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
-        correct = tf.to_float(tf.nn.in_top_k(logits, label, 1), name='correct')
+        correct = tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32, name='correct')
        # monitor training error
        add_moving_summary(tf.reduce_mean(correct, name='accuracy'))

--- a/examples/basics/mnist-tfslim.py
+++ b/examples/basics/mnist-tfslim.py
@@ -46,7 +46,7 @@ class Model(ModelDesc):
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
-        acc = tf.to_float(tf.nn.in_top_k(logits, label, 1))
+        acc = tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32)
        acc = tf.reduce_mean(acc, name='accuracy')
        summary.add_moving_summary(acc)

--- a/examples/basics/mnist-visualizations.py
+++ b/examples/basics/mnist-visualizations.py
@@ -97,7 +97,7 @@ class Model(ModelDesc):
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')
-        tf.reduce_mean(tf.to_float(tf.nn.in_top_k(logits, label, 1)), name='accuracy')
+        tf.reduce_mean(tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32), name='accuracy')
        wd_cost = tf.multiply(1e-5,
                              regularize_cost('fc.*/W', tf.nn.l2_loss),

--- a/examples/basics/svhn-digit-convnet.py
+++ b/examples/basics/svhn-digit-convnet.py
@@ -43,7 +43,7 @@ class Model(ModelDesc):
                      .FullyConnected('linear', units=10)())
        tf.nn.softmax(logits, name='output')
-        accuracy = tf.to_float(tf.nn.in_top_k(logits, label, 1))
+        accuracy = tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32)
        add_moving_summary(tf.reduce_mean(accuracy, name='accuracy'))
        cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)

--- a/examples/keras/imagenet-resnet-keras.py
+++ b/examples/keras/imagenet-resnet-keras.py
@@ -16,6 +16,7 @@ from tensorpack.contrib.keras import KerasModel
 from tensorpack.callbacks import *
 from tensorflow.python.keras.layers import *
+from tensorpack.tfutils.common import get_tf_version_tuple
 from imagenet_utils import get_imagenet_dataflow, fbresnet_augmentor
@@ -34,7 +35,8 @@ def conv(x, filters, kernel, strides=1, name=None):
    return Conv2D(filters, kernel, name=name,
                  strides=strides, use_bias=False, padding='same',
                  kernel_initializer=tf.keras.initializers.VarianceScaling(
-                      scale=2.0, mode='fan_out', distribution='normal'),
+                      scale=2.0, mode='fan_out',
+                      distribution='untruncated_normal' if get_tf_version_tuple() >= (1, 12) else 'normal'),
                  kernel_regularizer=tf.keras.regularizers.l2(5e-5))(x)

--- a/examples/keras/mnist-keras.py
+++ b/examples/keras/mnist-keras.py
@@ -54,7 +54,7 @@ class Model(ModelDesc):
        cost = tf.reduce_mean(cost, name='cross_entropy_loss')  # the average cross-entropy loss
        # for tensorpack validation
-        acc = tf.to_float(tf.nn.in_top_k(logits, label, 1))
+        acc = tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32)
        acc = tf.reduce_mean(acc, name='accuracy')
        summary.add_moving_summary(acc)

--- a/tensorpack/callbacks/steps.py
+++ b/tensorpack/callbacks/steps.py
@@ -115,10 +115,9 @@ class MaintainStepCounter(Callback):
        # ensure it exists
        gs_var = get_global_step_var()
        with tf.name_scope(None):
-            with self.graph.colocate_with(gs_var):
+            self.gs_incr_op = tf.assign_add(
-                self.gs_incr_op = tf.assign_add(
+                gs_var, 1,
-                    gs_var, 1,
+                name=GLOBAL_STEP_INCR_OP_NAME).op
-                    name=GLOBAL_STEP_INCR_OP_NAME).op
        self._fetches = tf.train.SessionRunArgs(self.gs_incr_op)
    def _before_train(self):

--- a/tensorpack/models/conv2d.py
+++ b/tensorpack/models/conv2d.py
@@ -52,7 +52,7 @@ def Conv2D(
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0),
        else:
-            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0)
+            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    if split == 1:
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv2D(
@@ -160,7 +160,7 @@ def Conv2DTranspose(
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0),
        else:
-            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0)
+            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
        layer = tf.layers.Conv2DTranspose(

--- a/tensorpack/models/fc.py
+++ b/tensorpack/models/fc.py
@@ -50,7 +50,7 @@ def FullyConnected(
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0),
        else:
-            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0)
+            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    inputs = batch_flatten(inputs)
    with rename_get_variable({'kernel': 'W', 'bias': 'b'}):

--- a/tensorpack/tfutils/optimizer.py
+++ b/tensorpack/tfutils/optimizer.py
@@ -6,6 +6,7 @@ import tensorflow as tf
 from contextlib import contextmanager
 from ..utils.develop import HIDE_DOC
+from ..tfutils.common import get_tf_version_tuple
 from .gradproc import FilterNoneGrad, GradientProcessor
 __all__ = ['apply_grad_processors', 'ProxyOptimizer',
@@ -85,7 +86,7 @@ class PostProcessOptimizer(ProxyOptimizer):
            opt (tf.train.Optimizer):
            func (tf.Variable -> tf.Operation or None): the operation needed
                to perform for this variable after the gradient update.
-            colocate (boolean): colocate the function with the variable.
+            colocate (boolean): colocate the function with the variable. No effect since TF 1.13.
        """
        super(PostProcessOptimizer, self).__init__(opt)
        self._func = func
@@ -109,7 +110,7 @@ class PostProcessOptimizer(ProxyOptimizer):
    @contextmanager
    def _maybe_colocate(self, var):
        G = tf.get_default_graph()
-        if self._colocate:
+        if self._colocate and get_tf_version_tuple() <= (1, 12):
            with G.colocate_with(var):
                yield
        else: