Commit 6ecaab67 authored by Yuxin Wu

add a bunch of scope & names for debugging

parent a9a3b7d1
@@ -48,6 +48,7 @@ class Callback(object):
         self.graph = tf.get_default_graph()
         self.epoch_num = self.trainer.config.starting_epoch - 1
         # self.epoch_num is always the number of epochs that finished updating parameters.
+        with tf.name_scope(type(self).__name__):
            self._setup_graph()

    def _setup_graph(self):
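All the changes in this commit follow the same pattern: graph-building code gets wrapped in `tf.name_scope` so the resulting ops are grouped under a readable prefix in TensorBoard's graph view. A minimal sketch of the effect, using the graph-mode API this repo targets (`CounterCallback` and the `zero` op are made up for illustration):

```python
import tensorflow as tf

class CounterCallback(object):
    """Hypothetical callback, only to show how the class name becomes a name scope."""
    def setup_graph(self):
        # Same idea as the diff: everything _setup_graph() creates lands
        # under a scope named after the concrete class.
        with tf.name_scope(type(self).__name__):
            self._setup_graph()

    def _setup_graph(self):
        self.zero = tf.constant(0, name='zero')

cb = CounterCallback()
cb.setup_graph()
print(cb.zero.op.name)   # -> "CounterCallback/zero"
```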
@@ -22,6 +22,7 @@ class GradientProcessor(object):
        :param grads: list of (grad, var)
        :returns: symbolic gradients with the same type as input
        """
+        with tf.name_scope(type(self).__name__):
            return self._process(grads)

    @abstractmethod
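`GradientProcessor.process()` gets the same treatment, so each concrete processor's ops are scoped by its class name. A simplified stand-in for the base class with a hypothetical `ClipGradient` subclass (not the real tensorpack classes, just the pattern):

```python
import tensorflow as tf

class GradientProcessorSketch(object):
    """Simplified stand-in for tensorpack's GradientProcessor base class."""
    def process(self, grads):
        # Every op a subclass creates in _process() ends up under a scope
        # named after the subclass, e.g. "ClipGradient/...".
        with tf.name_scope(type(self).__name__):
            return self._process(grads)

    def _process(self, grads):
        raise NotImplementedError

class ClipGradient(GradientProcessorSketch):
    """Hypothetical processor: clip every gradient into [-1, 1]."""
    def _process(self, grads):
        return [(tf.clip_by_value(g, -1.0, 1.0), v) for g, v in grads]
```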
@@ -32,6 +32,7 @@ def add_activation_summary(x, name=None):
        "Summary a scalar with histogram? Maybe use scalar instead. FIXME!"
    if name is None:
        name = x.name
+    with tf.name_scope('act_summary'):
        tf.histogram_summary(name + '/activation', x)
        tf.scalar_summary(name + '/activation_sparsity', tf.nn.zero_fraction(x))
        tf.scalar_summary(
@@ -70,6 +71,7 @@ def add_param_summary(summary_lists):
    import re
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
+    with tf.name_scope('param_summary'):
        for p in params:
            name = p.name
            for rgx, actions in summary_lists:
@@ -94,9 +96,11 @@ def summary_moving_average():
    MOVING_SUMMARY_VARS_KEY.
    :returns: a op to maintain these average.
    """
+    with tf.name_scope('EMA_summary'):
        global_step_var = get_global_step_var()
+        with tf.name_scope(None):
            averager = tf.train.ExponentialMovingAverage(
-                0.99, num_updates=global_step_var, name='moving_averages')
+                0.99, num_updates=global_step_var, name='EMA')
        vars_to_summary = tf.get_collection(MOVING_SUMMARY_VARS_KEY)
        avg_maintain_op = averager.apply(vars_to_summary)
        for idx, c in enumerate(vars_to_summary):
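The nested `tf.name_scope(None)` is the non-obvious part: passing None resets to the top-level scope, so the `ExponentialMovingAverage` (and the 'EMA'-named shadow state it later creates) presumably keeps names that are not prefixed with `EMA_summary/`. A tiny sketch of the reset behavior:

```python
import tensorflow as tf

with tf.name_scope('EMA_summary'):
    a = tf.constant(1.0, name='a')       # op name: "EMA_summary/a"
    with tf.name_scope(None):
        b = tf.constant(1.0, name='b')   # op name: "b", back at the root scope
    c = tf.constant(1.0, name='c')       # op name: "EMA_summary/c" again

print(b.op.name)   # -> "b"
```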
@@ -25,6 +25,7 @@ class MultiGPUTrainer(QueueInputTrainer):
    @staticmethod
    def _average_grads(tower_grads):
        ret = []
+        with tf.name_scope('average_grad'):
            for grad_and_vars in zip(*tower_grads):
                v = grad_and_vars[0][1]
                try:
@@ -73,7 +74,7 @@ class SyncMultiGPUTrainer(MultiGPUTrainer):
        self.train_op = tf.group(
            self.config.optimizer.apply_gradients(grads, get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), name='train_op')
        describe_model()

        with freeze_collection(self.SUMMARY_BACKUP_KEYS):
@@ -92,6 +93,7 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer):
        # pretend to average the grads, in order to make async and
        # sync have consistent effective learning rate
        def scale(grads):
+            with tf.name_scope('async_scale_grad'):
                return [(grad / self.config.nr_tower, var) for grad, var in grads]
        grad_list = map(scale, grad_list)
        grad_list = [self.process_grads(g) for g in grad_list]
@@ -99,7 +101,7 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer):
        # use grad from the first tower for iteration in main thread
        self.train_op = tf.group(
            self.config.optimizer.apply_gradients(grad_list[0], get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), name='train_op')
        describe_model()

        # prepare train_op for the rest of the towers
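For context, `_average_grads` reduces each variable's per-tower gradients to a single averaged gradient; the new scope only groups the resulting ops under `average_grad/`. A stripped-down sketch of that reduction (the real method's try/except handling of missing gradients is omitted):

```python
import tensorflow as tf

def average_grads_sketch(tower_grads):
    """tower_grads: one list of (grad, var) pairs per tower, in the same variable order."""
    ret = []
    with tf.name_scope('average_grad'):
        # zip(*tower_grads) yields, per variable, its (grad, var) pair from every tower.
        for grad_and_vars in zip(*tower_grads):
            v = grad_and_vars[0][1]            # the variable itself is shared across towers
            grads = [g for g, _ in grad_and_vars]
            avg_grad = tf.add_n(grads) / float(len(grads))
            ret.append((avg_grad, v))
    return ret
```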
@@ -175,13 +175,23 @@ class QueueInputTrainer(Trainer):
        self.train_op = tf.group(
            self.config.optimizer.apply_gradients(grads, get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), 'train_op')
        self.main_loop()

    def run_step(self):
        """ just run self.train_op"""
        self.sess.run([self.train_op])
+        #run_metadata = tf.RunMetadata()
+        #self.sess.run([self.train_op],
+                #options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
+                #run_metadata=run_metadata
+                #)
+        #from tensorflow.python.client import timeline
+        #trace = timeline.Timeline(step_stats=run_metadata.step_stats)
+        #trace_file = open('timeline.ctf.json', 'w')
+        #trace_file.write(trace.generate_chrome_trace_format())
+        #import sys; sys.exit()

    def _trigger_epoch(self):
        # need to run summary_op every epoch
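The commented-out block is a recipe for profiling one training step with TensorFlow's Chrome-trace timeline. A self-contained version of the same idea, where the matmul graph merely stands in for `self.train_op`:

```python
import tensorflow as tf
from tensorflow.python.client import timeline

# A trivial graph to profile; stands in for the trainer's train_op.
x = tf.random_normal([1024, 1024])
train_op = tf.matmul(x, x)

with tf.Session() as sess:
    run_metadata = tf.RunMetadata()
    sess.run(train_op,
             options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
             run_metadata=run_metadata)
    # Convert the collected step stats to Chrome's trace format;
    # open the file in chrome://tracing to inspect per-op timing.
    trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    with open('timeline.ctf.json', 'w') as f:
        f.write(trace.generate_chrome_trace_format())
```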