Commit 6ecaab67 authored by Yuxin Wu

add a bunch of scope & names for debugging

parent a9a3b7d1
@@ -48,6 +48,7 @@ class Callback(object):
         self.graph = tf.get_default_graph()
         self.epoch_num = self.trainer.config.starting_epoch - 1
         # self.epoch_num is always the number of epochs that finished updating parameters.
+        with tf.name_scope(type(self).__name__):
             self._setup_graph()
     def _setup_graph(self):
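The hunk above wraps each callback's `_setup_graph` call in a name scope taken from the subclass name, so whatever ops a callback creates are grouped under that name in the TensorBoard graph view. A minimal sketch of the pattern (the `StatCounter` class is made up, not tensorpack code), assuming graph-mode TensorFlow 1.x (`tf.compat.v1` under TF 2):

# Minimal sketch of the callback scoping pattern; StatCounter is hypothetical.
import tensorflow as tf

class StatCounter(object):
    def setup_graph(self):
        # ops built by the subclass land under a scope named after it
        with tf.name_scope(type(self).__name__):
            self._setup_graph()

    def _setup_graph(self):
        self.count = tf.add(1, 1, name='count')

c = StatCounter()
c.setup_graph()
print(c.count.op.name)   # -> StatCounter/count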
@@ -22,6 +22,7 @@ class GradientProcessor(object):
         :param grads: list of (grad, var)
         :returns: symbolic gradients with the same type as input
         """
+        with tf.name_scope(type(self).__name__):
             return self._process(grads)
     @abstractmethod
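`GradientProcessor.process` gets the same treatment: every op created by the subclass's `_process` ends up under a scope named after the processor class. A toy re-creation (the `ScaleGradient` class is hypothetical, not the tensorpack implementation), again assuming graph-mode TF 1.x:

# Toy re-creation of the GradientProcessor scoping pattern.
import tensorflow as tf

class ScaleGradient(object):
    def process(self, grads):
        # ops created by _process are grouped under the class name
        with tf.name_scope(type(self).__name__):
            return self._process(grads)

    def _process(self, grads):
        return [(g * 0.1, v) for g, v in grads]

w = tf.Variable(1.0, name='w')
g = tf.constant(2.0, name='grad_w')
(scaled_g, _), = ScaleGradient().process([(g, w)])
print(scaled_g.op.name)   # e.g. ScaleGradient/mul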
@@ -32,6 +32,7 @@ def add_activation_summary(x, name=None):
         "Summary a scalar with histogram? Maybe use scalar instead. FIXME!"
     if name is None:
         name = x.name
+    with tf.name_scope('act_summary'):
         tf.histogram_summary(name + '/activation', x)
         tf.scalar_summary(name + '/activation_sparsity', tf.nn.zero_fraction(x))
         tf.scalar_summary(
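Note that with the old `tf.histogram_summary`/`tf.scalar_summary` API the tag is the literal string passed in, so the new 'act_summary' scope mainly controls where the summary ops themselves sit in the graph view rather than the tags. A rough equivalent using the TF 1.x `tf.summary` module (names here are illustrative):

# Rough equivalent of add_activation_summary with the TF 1.x tf.summary API.
import tensorflow as tf

def add_activation_summary(x, name=None):
    if name is None:
        name = x.name
    with tf.name_scope('act_summary'):
        tf.summary.histogram(name + '/activation', x)
        tf.summary.scalar(name + '/activation_sparsity', tf.nn.zero_fraction(x))

x = tf.nn.relu(tf.random_normal([8, 8]))
add_activation_summary(x, name='fc1/output')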
@@ -70,6 +71,7 @@ def add_param_summary(summary_lists):
     import re
     params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
+    with tf.name_scope('param_summary'):
         for p in params:
             name = p.name
             for rgx, actions in summary_lists:
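`add_param_summary` walks the trainable variables and matches their names against user-supplied regexes; the whole loop now lives under a single 'param_summary' scope. A simplified sketch of that pattern (only a 'histogram' action is handled here, unlike tensorpack's version):

# Simplified sketch of add_param_summary under one 'param_summary' scope.
import re
import tensorflow as tf

def add_param_summary(summary_lists):
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    with tf.name_scope('param_summary'):
        for p in params:
            name = p.name
            for rgx, actions in summary_lists:
                if re.match(rgx, name) and 'histogram' in actions:
                    tf.summary.histogram(name.replace(':0', ''), p)

w = tf.Variable(tf.zeros([3, 3]), name='conv1/W')
add_param_summary([('.*/W', ['histogram'])])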
@@ -94,9 +96,11 @@ def summary_moving_average():
     MOVING_SUMMARY_VARS_KEY.
     :returns: a op to maintain these average.
     """
+    with tf.name_scope('EMA_summary'):
         global_step_var = get_global_step_var()
+        with tf.name_scope(None):
             averager = tf.train.ExponentialMovingAverage(
-                0.99, num_updates=global_step_var, name='moving_averages')
+                0.99, num_updates=global_step_var, name='EMA')
         vars_to_summary = tf.get_collection(MOVING_SUMMARY_VARS_KEY)
         avg_maintain_op = averager.apply(vars_to_summary)
         for idx, c in enumerate(vars_to_summary):
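The interesting detail here is the nested `tf.name_scope(None)`: it resets to the root scope, so the moving-average ops keep a short top-level 'EMA' name instead of being buried under 'EMA_summary/'. A small demo of the reset behavior (toy op names, not tensorpack's):

# Demonstrates that tf.name_scope(None) escapes back to the root scope.
import tensorflow as tf

with tf.name_scope('EMA_summary'):
    a = tf.constant(1.0, name='inside')        # -> EMA_summary/inside
    with tf.name_scope(None):
        b = tf.constant(1.0, name='escaped')   # -> escaped (root scope)

print(a.op.name, b.op.name)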
@@ -25,6 +25,7 @@ class MultiGPUTrainer(QueueInputTrainer):
     @staticmethod
     def _average_grads(tower_grads):
         ret = []
+        with tf.name_scope('average_grad'):
             for grad_and_vars in zip(*tower_grads):
                 v = grad_and_vars[0][1]
                 try:
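`_average_grads` averages the per-tower gradients for each variable, and those averaging ops are now grouped under 'average_grad'. A toy version of the pattern, assuming `tower_grads` is one list of `(grad, var)` pairs per GPU and every tower supplies a gradient for every variable:

# Toy per-tower gradient averaging under a name scope (no None handling).
import tensorflow as tf

def average_grads(tower_grads):
    ret = []
    with tf.name_scope('average_grad'):
        for grad_and_vars in zip(*tower_grads):
            v = grad_and_vars[0][1]              # the variable is shared across towers
            grads = [g for g, _ in grad_and_vars]
            ret.append((tf.add_n(grads) / len(grads), v))
    return ret

w = tf.Variable(1.0, name='w')
tower_grads = [[(tf.constant(2.0), w)], [(tf.constant(4.0), w)]]
avg = average_grads(tower_grads)   # averaged gradient ops live under 'average_grad/'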
@@ -73,7 +74,7 @@ class SyncMultiGPUTrainer(MultiGPUTrainer):
         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grads, get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), name='train_op')
         describe_model()
         with freeze_collection(self.SUMMARY_BACKUP_KEYS):
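Giving the grouped op an explicit `name='train_op'` makes the final training op easy to spot in the graph, instead of an auto-generated group name. For example:

# tf.group with an explicit name: the combined op shows up as 'train_op'.
import tensorflow as tf

step = tf.Variable(0, name='step')
inc = tf.assign_add(step, 1)
other = tf.no_op(name='maintain_averages')
train_op = tf.group(inc, other, name='train_op')
print(train_op.name)   # -> train_op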
@@ -92,6 +93,7 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer):
         # pretend to average the grads, in order to make async and
         # sync have consistent effective learning rate
         def scale(grads):
+            with tf.name_scope('async_scale_grad'):
                 return [(grad / self.config.nr_tower, var) for grad, var in grads]
         grad_list = map(scale, grad_list)
         grad_list = [self.process_grads(g) for g in grad_list]
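Dividing each gradient by the tower count keeps the asynchronous trainer's effective learning rate comparable to the synchronous one, and the division ops now get their own 'async_scale_grad' scope. A stand-alone sketch (the `nr_tower` value is made up, standing in for `self.config.nr_tower`):

# Stand-alone sketch of the async gradient scaling.
import tensorflow as tf

nr_tower = 4

def scale(grads):
    with tf.name_scope('async_scale_grad'):
        return [(grad / nr_tower, var) for grad, var in grads]

w = tf.Variable(1.0, name='w')
scaled = scale([(tf.constant(8.0), w)])   # each tower's gradient becomes 8.0 / 4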
@@ -99,7 +101,7 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer):
         # use grad from the first tower for iteration in main thread
         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grad_list[0], get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), name='train_op')
         describe_model()
         # prepare train_op for the rest of the towers
@@ -175,13 +175,23 @@ class QueueInputTrainer(Trainer):
         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grads, get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), 'train_op')
         self.main_loop()
     def run_step(self):
         """ just run self.train_op"""
         self.sess.run([self.train_op])
+        #run_metadata = tf.RunMetadata()
+        #self.sess.run([self.train_op],
+                #options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
+                #run_metadata=run_metadata
+                #)
+        #from tensorflow.python.client import timeline
+        #trace = timeline.Timeline(step_stats=run_metadata.step_stats)
+        #trace_file = open('timeline.ctf.json', 'w')
+        #trace_file.write(trace.generate_chrome_trace_format())
+        #import sys; sys.exit()
     def _trigger_epoch(self):
         # need to run summary_op every epoch
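The commented-out block in `run_step` is the standard TF 1.x recipe for profiling a single step: run the op with `FULL_TRACE` options, collect `RunMetadata`, and dump a Chrome trace that can be loaded at chrome://tracing. A self-contained version of that snippet (toy graph, not the trainer's):

# Self-contained version of the commented-out profiling snippet above.
import tensorflow as tf
from tensorflow.python.client import timeline

x = tf.Variable(tf.random_normal([1000, 1000]))
train_op = tf.assign(x, tf.matmul(x, x) * 1e-3)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    run_metadata = tf.RunMetadata()
    sess.run(train_op,
             options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
             run_metadata=run_metadata)
    trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    with open('timeline.ctf.json', 'w') as f:
        f.write(trace.generate_chrome_trace_format())
    # load timeline.ctf.json in chrome://tracing to inspect per-op timing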