Commit a3674b47 authored by Yuxin Wu

improve logging in complicated variable management

parent 399db3ae
@@ -43,10 +43,10 @@ def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
update_op2 = moving_averages.assign_moving_average(
moving_var, batch_var, decay, zero_debias=False,
name='var_ema_op')
# Only add model var when we update them
add_model_variable(moving_mean)
add_model_variable(moving_var)
# seems faster than delayed update, but might behave otherwise in distributed settings.
# TODO add an option, and maybe enable it for replica mode?
# with tf.control_dependencies([update_op1, update_op2]):
# return tf.identity(xn, name='output')
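The two commented-out lines above hint at the delayed-update alternative mentioned in the TODO. A minimal sketch of what enabling it could look like (hypothetical helper, not part of this commit; it assumes the same update_op1/update_op2 created in update_bn_ema):

import tensorflow as tf

def _delayed_update(xn, update_op1, update_op2):
    # hypothetical: return the normalized output only after both EMA ops have run
    with tf.control_dependencies([update_op1, update_op2]):
        return tf.identity(xn, name='output')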
@@ -143,8 +143,8 @@ def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5,
xn = tf.nn.batch_normalization(
x, moving_mean, moving_var, beta, gamma, epsilon)
# maintain EMA only on one GPU.
if ctx.is_main_training_tower or ctx.has_own_variables:
# it is OK to maintain EMA only on one GPU.
if ctx.is_main_training_tower:
ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay)
else:
ret = tf.identity(xn, name='output')
@@ -231,6 +231,7 @@ def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5,
xn = tf.nn.batch_normalization(
x, moving_mean, moving_var, beta, gamma, epsilon)
# training also needs EMA, so ideally we should maintain it on every tower
if ctx.is_main_training_tower or ctx.has_own_variables:
ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay)
else:
......
@@ -5,6 +5,7 @@
import tensorflow as tf
from functools import wraps
import six
import re
import copy
from ..tfutils.argscope import get_arg_scope
@@ -123,7 +124,8 @@ def layer_register(
if name is not None: # use scope
with tf.variable_scope(name) as scope:
do_log_shape = log_shape and scope.name not in _LAYER_LOGGED
scope_name = re.sub('tower[0-9]+/', '', scope.name)
do_log_shape = log_shape and scope_name not in _LAYER_LOGGED
if do_log_shape:
logger.info("{} input: {}".format(scope.name, get_shape_str(inputs)))
......
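The added re.sub strips any leading tower scope before checking _LAYER_LOGGED, so a layer's shapes are logged only once across towers. An illustrative example of the substitution (the scope names are made up):

import re
# 'tower0/conv1' and 'tower3/conv1' both normalize to 'conv1',
# so the layer is recorded a single time in _LAYER_LOGGED
print(re.sub('tower[0-9]+/', '', 'tower3/conv1'))  # prints 'conv1'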
@@ -24,7 +24,8 @@ l1_regularizer = tf.contrib.layers.l1_regularizer
def regularize_cost(regex, func, name='regularize_cost'):
"""
Apply a regularizer on every trainable variable matching the regex.
Apply a regularizer on trainable variables matching the regex.
In replicated mode, will only regularize variables within the current tower.
Args:
regex (str): a regex to match variable names, e.g. "conv.*/W"
......
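For reference, a hedged usage sketch of regularize_cost based on the docstring above (the L2 regularizer and the weight-decay value are assumptions, not part of this commit; regularize_cost is assumed to be imported from this module):

# apply L2 weight decay to every conv kernel matching the docstring's example regex
wd_cost = regularize_cost('conv.*/W',
                          tf.contrib.layers.l2_regularizer(1e-4),
                          name='regularize_cost')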
@@ -14,19 +14,34 @@ __all__ = ['describe_model', 'get_shape_str', 'apply_slim_collections']
def describe_model():
""" Print a description of the current model parameters """
"""
Print a description of the current model parameters.
Skip variables starting with "tower".
"""
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
if len(train_vars) == 0:
logger.info("No trainable variables in the graph!")
logger.warn("No trainable variables in the graph!")
return
total = 0
data = []
devices = set()
for v in train_vars:
if v.name.startswith('tower'):
continue
shape = v.get_shape()
ele = shape.num_elements()
total += ele
data.append([v.name, shape.as_list(), ele])
table = tabulate(data, headers=['name', 'shape', 'dim'])
devices.add(v.device)
data.append([v.name, shape.as_list(), ele, v.device])
if len(devices) == 1:
# don't log the device if all vars are on the same device
for d in data:
d.pop()
table = tabulate(data, headers=['name', 'shape', 'dim'])
else:
table = tabulate(data, headers=['name', 'shape', 'dim', 'device'])
size_mb = total * 4 / 1024.0**2
summary_msg = colored(
"\nTotal #vars={}, #param={} ({:.02f} MB assuming all float32)".format(
......
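To see the new device-column behavior in isolation, a small self-contained sketch (the rows below are made-up inputs in the same [name, shape, dim, device] layout as data above, not from the commit):

from tabulate import tabulate

rows = [['conv1/W', [3, 3, 3, 64], 1728, '/device:GPU:0'],
        ['fc/W', [256, 10], 2560, '/device:GPU:0']]
devices = {r[-1] for r in rows}
if len(devices) == 1:
    # every variable lives on the same device, so drop the device column
    print(tabulate([r[:-1] for r in rows], headers=['name', 'shape', 'dim']))
else:
    print(tabulate(rows, headers=['name', 'shape', 'dim', 'device']))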