Commit 6ecaab67 authored by Yuxin Wu

add a bunch of scope & names for debugging

parent a9a3b7d1
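This commit wraps graph construction in tf.name_scope blocks so the resulting ops get readable prefixes and appear as collapsible groups in the TensorBoard graph. A minimal sketch of that effect, not part of the commit (the scope and op names here are made up; TF 1.x graph mode assumed):

import tensorflow as tf

# Ops created inside a name_scope get the scope as a prefix in their names,
# which is what groups them into one collapsible node in the TensorBoard graph.
with tf.Graph().as_default():
    a = tf.constant(1.0, name='a')          # op name: "a"
    with tf.name_scope('debug_scope'):
        b = tf.constant(2.0, name='b')      # op name: "debug_scope/b"
        c = tf.add(a, b, name='sum')        # op name: "debug_scope/sum"
    print(a.op.name, b.op.name, c.op.name)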
@@ -48,7 +48,8 @@ class Callback(object):
         self.graph = tf.get_default_graph()
         self.epoch_num = self.trainer.config.starting_epoch - 1
         # self.epoch_num is always the number of epochs that finished updating parameters.
-        self._setup_graph()
+        with tf.name_scope(type(self).__name__):
+            self._setup_graph()
 
     def _setup_graph(self):
         pass
......
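A rough sketch of what the Callback change buys: every subclass now builds its ops under a scope named after the class. The StepCounter subclass below is hypothetical and only for illustration (TF 1.x graph mode assumed):

import tensorflow as tf

class Callback(object):
    def setup_graph(self):
        # same idea as the diff: scope the subclass's graph construction
        with tf.name_scope(type(self).__name__):
            self._setup_graph()

    def _setup_graph(self):
        pass

class StepCounter(Callback):   # hypothetical subclass, for illustration only
    def _setup_graph(self):
        self.counter = tf.Variable(0, name='counter')   # op name: "StepCounter/counter"

with tf.Graph().as_default():
    cb = StepCounter()
    cb.setup_graph()
    print(cb.counter.op.name)   # StepCounter/counter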
@@ -22,7 +22,8 @@ class GradientProcessor(object):
         :param grads: list of (grad, var)
         :returns: symbolic gradients with the same type as input
         """
-        return self._process(grads)
+        with tf.name_scope(type(self).__name__):
+            return self._process(grads)
 
     @abstractmethod
     def _process(self, grads):
......
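The GradientProcessor change follows the same pattern: whatever ops a subclass builds in _process end up under the subclass name. A sketch with a made-up ClipGradient processor, not the library's actual class (TF 1.x assumed):

import tensorflow as tf

class GradientProcessor(object):
    def process(self, grads):
        # as in the diff: group all ops built by _process under the class name
        with tf.name_scope(type(self).__name__):
            return self._process(grads)

    def _process(self, grads):
        raise NotImplementedError()

class ClipGradient(GradientProcessor):   # illustrative subclass
    def _process(self, grads):
        return [(tf.clip_by_value(g, -1.0, 1.0), v) for g, v in grads]

with tf.Graph().as_default():
    v = tf.Variable(1.0, name='w')
    g = tf.constant(3.0, name='grad_w')
    processed = ClipGradient().process([(g, v)])
    print(processed[0][0].op.name)   # something like "ClipGradient/clip_by_value"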
@@ -32,11 +32,12 @@ def add_activation_summary(x, name=None):
         "Summary a scalar with histogram? Maybe use scalar instead. FIXME!"
     if name is None:
         name = x.name
-    tf.histogram_summary(name + '/activation', x)
-    tf.scalar_summary(name + '/activation_sparsity', tf.nn.zero_fraction(x))
-    tf.scalar_summary(
-        name + '/activation_rms',
-        tf.sqrt(tf.reduce_mean(tf.square(x))))
+    with tf.name_scope('act_summary'):
+        tf.histogram_summary(name + '/activation', x)
+        tf.scalar_summary(name + '/activation_sparsity', tf.nn.zero_fraction(x))
+        tf.scalar_summary(
+            name + '/activation_rms',
+            tf.sqrt(tf.reduce_mean(tf.square(x))))
 
 def add_param_summary(summary_lists):
     """
@@ -70,14 +71,15 @@ def add_param_summary(summary_lists):
     import re
     params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
-    for p in params:
-        name = p.name
-        for rgx, actions in summary_lists:
-            if not rgx.endswith('$'):
-                rgx = rgx + '(:0)?$'
-            if re.match(rgx, name):
-                for act in actions:
-                    perform(p, act)
+    with tf.name_scope('param_summary'):
+        for p in params:
+            name = p.name
+            for rgx, actions in summary_lists:
+                if not rgx.endswith('$'):
+                    rgx = rgx + '(:0)?$'
+                if re.match(rgx, name):
+                    for act in actions:
+                        perform(p, act)
 
 def add_moving_summary(v, *args):
     """
@@ -94,13 +96,15 @@ def summary_moving_average():
         MOVING_SUMMARY_VARS_KEY.
         :returns: a op to maintain these average.
     """
-    global_step_var = get_global_step_var()
-    averager = tf.train.ExponentialMovingAverage(
-        0.99, num_updates=global_step_var, name='moving_averages')
-    vars_to_summary = tf.get_collection(MOVING_SUMMARY_VARS_KEY)
-    avg_maintain_op = averager.apply(vars_to_summary)
-    for idx, c in enumerate(vars_to_summary):
-        name = re.sub('tower[p0-9]+/', '', c.op.name)
-        tf.scalar_summary(name, averager.average(c))
-    return avg_maintain_op
+    with tf.name_scope('EMA_summary'):
+        global_step_var = get_global_step_var()
+        with tf.name_scope(None):
+            averager = tf.train.ExponentialMovingAverage(
+                0.99, num_updates=global_step_var, name='EMA')
+            vars_to_summary = tf.get_collection(MOVING_SUMMARY_VARS_KEY)
+            avg_maintain_op = averager.apply(vars_to_summary)
+        for idx, c in enumerate(vars_to_summary):
+            name = re.sub('tower[p0-9]+/', '', c.op.name)
+            tf.scalar_summary(name, averager.average(c))
+        return avg_maintain_op
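Note the tf.name_scope(None) above: passing None temporarily resets to the top-level name scope, so ops built there are not prefixed with 'EMA_summary'. A standalone illustration of that behaviour, not part of the commit (assumes TF 1.x graph mode):

import tensorflow as tf

with tf.Graph().as_default():
    with tf.name_scope('EMA_summary'):
        inner = tf.constant(1.0, name='inside')          # op name: "EMA_summary/inside"
        with tf.name_scope(None):
            # None resets to the top-level scope, so this op is NOT prefixed
            escaped = tf.constant(2.0, name='escaped')   # op name: "escaped"
    print(inner.op.name, escaped.op.name)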
@@ -25,14 +25,15 @@ class MultiGPUTrainer(QueueInputTrainer):
     @staticmethod
     def _average_grads(tower_grads):
         ret = []
-        for grad_and_vars in zip(*tower_grads):
-            v = grad_and_vars[0][1]
-            try:
-                grad = tf.add_n([x[0] for x in grad_and_vars]) / float(len(tower_grads))
-            except:
-                logger.error("Error while processing gradients of {}".format(v.name))
-                raise
-            ret.append((grad, v))
+        with tf.name_scope('average_grad'):
+            for grad_and_vars in zip(*tower_grads):
+                v = grad_and_vars[0][1]
+                try:
+                    grad = tf.add_n([x[0] for x in grad_and_vars]) / float(len(tower_grads))
+                except:
+                    logger.error("Error while processing gradients of {}".format(v.name))
+                    raise
+                ret.append((grad, v))
         return ret
 
     def _multi_tower_grads(self):
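The averaging itself is the usual multi-tower reduction: for each variable, sum the per-tower gradients and divide by the number of towers. A simplified, self-contained sketch of that reduction without the error handling (TF 1.x assumed; names here are illustrative):

import tensorflow as tf

def average_grads(tower_grads):
    """tower_grads: one [(grad, var), ...] list per tower, all in the same variable order."""
    ret = []
    with tf.name_scope('average_grad'):
        for grad_and_vars in zip(*tower_grads):
            v = grad_and_vars[0][1]
            grad = tf.add_n([g for g, _ in grad_and_vars]) / float(len(tower_grads))
            ret.append((grad, v))
    return ret

with tf.Graph().as_default():
    w = tf.Variable(1.0, name='w')
    tower0 = [(tf.constant(2.0), w)]
    tower1 = [(tf.constant(4.0), w)]
    avg, var = average_grads([tower0, tower1])[0]
    print(avg.op.name)   # the sum/div ops are grouped under "average_grad/..."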
@@ -73,7 +74,7 @@ class SyncMultiGPUTrainer(MultiGPUTrainer):
         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grads, get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), name='train_op')
         describe_model()
 
         with freeze_collection(self.SUMMARY_BACKUP_KEYS):
@@ -92,14 +93,15 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer):
         # pretend to average the grads, in order to make async and
         # sync have consistent effective learning rate
         def scale(grads):
-            return [(grad / self.config.nr_tower, var) for grad, var in grads]
+            with tf.name_scope('async_scale_grad'):
+                return [(grad / self.config.nr_tower, var) for grad, var in grads]
         grad_list = map(scale, grad_list)
         grad_list = [self.process_grads(g) for g in grad_list]
 
         # use grad from the first tower for iteration in main thread
         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grad_list[0], get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), name='train_op')
         describe_model()
 
         # prepare train_op for the rest of the towers
......
@@ -175,13 +175,23 @@ class QueueInputTrainer(Trainer):
         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grads, get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), 'train_op')
 
         self.main_loop()
 
     def run_step(self):
         """ just run self.train_op"""
         self.sess.run([self.train_op])
+        #run_metadata = tf.RunMetadata()
+        #self.sess.run([self.train_op],
+                #options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
+                #run_metadata=run_metadata
+                #)
+        #from tensorflow.python.client import timeline
+        #trace = timeline.Timeline(step_stats=run_metadata.step_stats)
+        #trace_file = open('timeline.ctf.json', 'w')
+        #trace_file.write(trace.generate_chrome_trace_format())
+        #import sys; sys.exit()
 
     def _trigger_epoch(self):
         # need to run summary_op every epoch
......
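The commented-out block in run_step is a Chrome-trace profiling hook. A standalone version of the same idea, not the trainer's code (assumes TF 1.x; writes timeline.ctf.json to the working directory):

import tensorflow as tf
from tensorflow.python.client import timeline

with tf.Graph().as_default():
    x = tf.random_normal([1024, 1024])
    y = tf.matmul(x, x)
    with tf.Session() as sess:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        # run one step with full tracing enabled
        sess.run(y, options=run_options, run_metadata=run_metadata)
        # convert the collected step stats into a Chrome trace
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        with open('timeline.ctf.json', 'w') as f:
            f.write(trace.generate_chrome_trace_format())
# open chrome://tracing and load timeline.ctf.json to inspect per-op timing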