Commit a3674b47 authored by Yuxin Wu

improve logging in complicated variable management

parent 399db3ae
@@ -43,10 +43,10 @@ def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
update_op2 = moving_averages.assign_moving_average(
moving_var, batch_var, decay, zero_debias=False,
name='var_ema_op')
# Only add model var when we update them
add_model_variable(moving_mean)
add_model_variable(moving_var)
# seems faster than delayed update, but might behave otherwise in distributed settings.
# TODO add an option, and maybe enable it for replica mode?
# with tf.control_dependencies([update_op1, update_op2]):
# return tf.identity(xn, name='output')
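The two commented-out lines above hint at the delayed-update alternative mentioned in the TODO. A minimal sketch of what enabling it could look like (hypothetical helper, not part of this commit; it assumes the same update_op1/update_op2 created in update_bn_ema):

import tensorflow as tf

def _delayed_update(xn, update_op1, update_op2):
    # hypothetical: return the normalized output only after both EMA ops have run
    with tf.control_dependencies([update_op1, update_op2]):
        return tf.identity(xn, name='output')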
@@ -143,8 +143,8 @@ def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5,
xn = tf.nn.batch_normalization(
x, moving_mean, moving_var, beta, gamma, epsilon)
# maintain EMA only on one GPU.
if ctx.is_main_training_tower or ctx.has_own_variables:
# it is OK to maintain EMA only on one GPU.
if ctx.is_main_training_tower:
ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay)
else:
ret = tf.identity(xn, name='output')
@@ -231,6 +231,7 @@ def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5,
xn = tf.nn.batch_normalization(
x, moving_mean, moving_var, beta, gamma, epsilon)
# training also needs EMA, so ideally we should maintain it on every tower
if ctx.is_main_training_tower or ctx.has_own_variables:
ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay)
else:
......
@@ -5,6 +5,7 @@
import tensorflow as tf
from functools import wraps
import six
import re
import copy
from ..tfutils.argscope import get_arg_scope
@@ -123,7 +124,8 @@ def layer_register(
if name is not None: # use scope
with tf.variable_scope(name) as scope:
do_log_shape = log_shape and scope.name not in _LAYER_LOGGED
scope_name = re.sub('tower[0-9]+/', '', scope.name)
do_log_shape = log_shape and scope_name not in _LAYER_LOGGED
if do_log_shape:
logger.info("{} input: {}".format(scope.name, get_shape_str(inputs)))
......
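The added re.sub strips any leading tower scope before checking _LAYER_LOGGED, so a layer's shapes are logged only once across towers. An illustrative example of the substitution (the scope names are made up):

import re
# 'tower0/conv1' and 'tower3/conv1' both normalize to 'conv1',
# so the layer is recorded a single time in _LAYER_LOGGED
print(re.sub('tower[0-9]+/', '', 'tower3/conv1'))  # prints 'conv1'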
@@ -24,7 +24,8 @@ l1_regularizer = tf.contrib.layers.l1_regularizer
def regularize_cost(regex, func, name='regularize_cost'):
"""
Apply a regularizer on every trainable variable matching the regex.
Apply a regularizer on trainable variables matching the regex.
In replicated mode, will only regularize variables within the current tower.
Args:
regex (str): a regex to match variable names, e.g. "conv.*/W"
......
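For reference, a hedged usage sketch of regularize_cost based on the docstring above (the L2 regularizer and the weight-decay value are assumptions, not part of this commit; regularize_cost is assumed to be imported from this module):

# apply L2 weight decay to every conv kernel matching the docstring's example regex
wd_cost = regularize_cost('conv.*/W',
                          tf.contrib.layers.l2_regularizer(1e-4),
                          name='regularize_cost')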
@@ -14,19 +14,34 @@ __all__ = ['describe_model', 'get_shape_str', 'apply_slim_collections']
def describe_model():
""" Print a description of the current model parameters """
"""
Print a description of the current model parameters.
Skip variables starting with "tower".
"""
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
if len(train_vars) == 0:
logger.info("No trainable variables in the graph!")
logger.warn("No trainable variables in the graph!")
return
total = 0
data = []
devices = set()
for v in train_vars:
if v.name.startswith('tower'):
continue
shape = v.get_shape()
ele = shape.num_elements()
total += ele
data.append([v.name, shape.as_list(), ele])
table = tabulate(data, headers=['name', 'shape', 'dim'])
devices.add(v.device)
data.append([v.name, shape.as_list(), ele, v.device])
if len(devices) == 1:
# don't log the device if all vars are on the same device
for d in data:
d.pop()
table = tabulate(data, headers=['name', 'shape', 'dim'])
else:
table = tabulate(data, headers=['name', 'shape', 'dim', 'device'])
size_mb = total * 4 / 1024.0**2
summary_msg = colored(
"\nTotal #vars={}, #param={} ({:.02f} MB assuming all float32)".format(
......
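To see the new device-column behavior in isolation, a small self-contained sketch (the rows below are made-up inputs in the same [name, shape, dim, device] layout as data above, not from the commit):

from tabulate import tabulate

rows = [['conv1/W', [3, 3, 3, 64], 1728, '/device:GPU:0'],
        ['fc/W', [256, 10], 2560, '/device:GPU:0']]
devices = {r[-1] for r in rows}
if len(devices) == 1:
    # every variable lives on the same device, so drop the device column
    print(tabulate([r[:-1] for r in rows], headers=['name', 'shape', 'dim']))
else:
    print(tabulate(rows, headers=['name', 'shape', 'dim', 'device']))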