Commit 6ecaab67 authored by Yuxin Wu

add a bunch of scope & names for debugging

parent a9a3b7d1
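The diff below wraps graph-construction code in tf.name_scope blocks and gives a few key ops explicit names, so related ops share a common prefix and collapse into one node in the TensorBoard graph view, which makes them easier to find when debugging. A minimal illustration of the effect (not part of the commit; assumes graph-mode TensorFlow, as used throughout this repo):

    import tensorflow as tf

    a = tf.constant(1.0, name='a')        # op name: 'a'
    with tf.name_scope('debug_scope'):
        b = tf.add(a, 1.0, name='b')      # op name: 'debug_scope/b'
    print(a.op.name, b.op.name)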
@@ -48,7 +48,8 @@ class Callback(object):
         self.graph = tf.get_default_graph()
         self.epoch_num = self.trainer.config.starting_epoch - 1
         # self.epoch_num is always the number of epochs that finished updating parameters.
-        self._setup_graph()
+        with tf.name_scope(type(self).__name__):
+            self._setup_graph()
 
     def _setup_graph(self):
         pass
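Because the scope name comes from type(self).__name__, every Callback subclass gets a scope named after the concrete class, so its setup ops are grouped separately per callback. A rough sketch of the pattern (class names here are hypothetical, graph mode assumed):

    import tensorflow as tf

    class Callback(object):
        def setup_graph(self):
            with tf.name_scope(type(self).__name__):   # e.g. 'MyCallback'
                self._setup_graph()

        def _setup_graph(self):
            pass

    class MyCallback(Callback):
        def _setup_graph(self):
            # this variable's op is created as 'MyCallback/counter'
            self.counter = tf.Variable(0, name='counter')

    MyCallback().setup_graph()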
@@ -22,7 +22,8 @@ class GradientProcessor(object):
         :param grads: list of (grad, var)
         :returns: symbolic gradients with the same type as input
         """
-        return self._process(grads)
+        with tf.name_scope(type(self).__name__):
+            return self._process(grads)
 
     @abstractmethod
     def _process(self, grads):
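The same pattern is applied to gradient processors: whatever ops a concrete processor builds in _process now appear under that processor's class name. A hedged sketch with a made-up subclass (graph mode assumed):

    import tensorflow as tf

    class GradientProcessor(object):
        def process(self, grads):
            with tf.name_scope(type(self).__name__):
                return self._process(grads)

    class ScaleGrad(GradientProcessor):       # hypothetical processor
        def _process(self, grads):
            # the mul ops end up under 'ScaleGrad/' in the graph
            return [(g * 0.1, v) for g, v in grads]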
@@ -32,11 +32,12 @@ def add_activation_summary(x, name=None):
         "Summary a scalar with histogram? Maybe use scalar instead. FIXME!"
     if name is None:
         name = x.name
-    tf.histogram_summary(name + '/activation', x)
-    tf.scalar_summary(name + '/activation_sparsity', tf.nn.zero_fraction(x))
-    tf.scalar_summary(
-        name + '/activation_rms',
-        tf.sqrt(tf.reduce_mean(tf.square(x))))
+    with tf.name_scope('act_summary'):
+        tf.histogram_summary(name + '/activation', x)
+        tf.scalar_summary(name + '/activation_sparsity', tf.nn.zero_fraction(x))
+        tf.scalar_summary(
+            name + '/activation_rms',
+            tf.sqrt(tf.reduce_mean(tf.square(x))))
 
 def add_param_summary(summary_lists):
     """
@@ -70,14 +71,15 @@ def add_param_summary(summary_lists):
     import re
     params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
-    for p in params:
-        name = p.name
-        for rgx, actions in summary_lists:
-            if not rgx.endswith('$'):
-                rgx = rgx + '(:0)?$'
-            if re.match(rgx, name):
-                for act in actions:
-                    perform(p, act)
+    with tf.name_scope('param_summary'):
+        for p in params:
+            name = p.name
+            for rgx, actions in summary_lists:
+                if not rgx.endswith('$'):
+                    rgx = rgx + '(:0)?$'
+                if re.match(rgx, name):
+                    for act in actions:
+                        perform(p, act)
 
 def add_moving_summary(v, *args):
     """
@@ -94,13 +96,15 @@ def summary_moving_average():
     MOVING_SUMMARY_VARS_KEY.
     :returns: a op to maintain these average.
     """
-    global_step_var = get_global_step_var()
-    averager = tf.train.ExponentialMovingAverage(
-        0.99, num_updates=global_step_var, name='moving_averages')
-    vars_to_summary = tf.get_collection(MOVING_SUMMARY_VARS_KEY)
-    avg_maintain_op = averager.apply(vars_to_summary)
-    for idx, c in enumerate(vars_to_summary):
-        name = re.sub('tower[p0-9]+/', '', c.op.name)
-        tf.scalar_summary(name, averager.average(c))
-    return avg_maintain_op
+    with tf.name_scope('EMA_summary'):
+        global_step_var = get_global_step_var()
+        with tf.name_scope(None):
+            averager = tf.train.ExponentialMovingAverage(
+                0.99, num_updates=global_step_var, name='EMA')
+        vars_to_summary = tf.get_collection(MOVING_SUMMARY_VARS_KEY)
+        avg_maintain_op = averager.apply(vars_to_summary)
+        for idx, c in enumerate(vars_to_summary):
+            name = re.sub('tower[p0-9]+/', '', c.op.name)
+            tf.scalar_summary(name, averager.average(c))
+        return avg_maintain_op
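The nested tf.name_scope(None) is the graph-mode idiom for temporarily resetting to the root name scope: passing None (or an empty string) clears the current scope, so the averager's 'EMA' ops are created at the top level even though the surrounding code runs inside 'EMA_summary', while the per-variable scalar summaries keep the 'EMA_summary/' prefix. A minimal illustration (graph mode assumed):

    import tensorflow as tf

    with tf.name_scope('EMA_summary'):
        inner = tf.no_op(name='inner')       # op name: 'EMA_summary/inner'
        with tf.name_scope(None):            # reset to the root scope
            outer = tf.no_op(name='outer')   # op name: 'outer'
    print(inner.name, outer.name)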
@@ -25,14 +25,15 @@ class MultiGPUTrainer(QueueInputTrainer):
     @staticmethod
     def _average_grads(tower_grads):
         ret = []
-        for grad_and_vars in zip(*tower_grads):
-            v = grad_and_vars[0][1]
-            try:
-                grad = tf.add_n([x[0] for x in grad_and_vars]) / float(len(tower_grads))
-            except:
-                logger.error("Error while processing gradients of {}".format(v.name))
-                raise
-            ret.append((grad, v))
+        with tf.name_scope('average_grad'):
+            for grad_and_vars in zip(*tower_grads):
+                v = grad_and_vars[0][1]
+                try:
+                    grad = tf.add_n([x[0] for x in grad_and_vars]) / float(len(tower_grads))
+                except:
+                    logger.error("Error while processing gradients of {}".format(v.name))
+                    raise
+                ret.append((grad, v))
         return ret
 
     def _multi_tower_grads(self):
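For context, tower_grads is a list with one [(grad, var), ...] list per GPU tower, so zip(*tower_grads) iterates over the towers' entries for the same variable; summing the per-tower gradients and dividing by the number of towers yields the averaged gradient. A small data-shape sketch in plain Python (tensor math elided):

    # two towers, two variables: tower_grads[i][j] == (grad of var j on tower i, var j)
    tower_grads = [
        [('g0_w', 'w'), ('g0_b', 'b')],   # tower 0
        [('g1_w', 'w'), ('g1_b', 'b')],   # tower 1
    ]
    for grad_and_vars in zip(*tower_grads):
        # e.g. (('g0_w', 'w'), ('g1_w', 'w')): average g0_w and g1_w for variable 'w'
        print(grad_and_vars)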
@@ -73,7 +74,7 @@ class SyncMultiGPUTrainer(MultiGPUTrainer):
         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grads, get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), name='train_op')
         describe_model()
 
         with freeze_collection(self.SUMMARY_BACKUP_KEYS):
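Giving the grouped op an explicit name makes it easy to locate later, e.g. when inspecting a dumped GraphDef or poking at the graph interactively. A small sketch (graph mode assumed):

    import tensorflow as tf

    a = tf.no_op(name='step_a')
    b = tf.no_op(name='step_b')
    train_op = tf.group(a, b, name='train_op')
    # the op can now be looked up by a stable name
    print(tf.get_default_graph().get_operation_by_name('train_op'))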
@@ -92,14 +93,15 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer):
         # pretend to average the grads, in order to make async and
         # sync have consistent effective learning rate
         def scale(grads):
-            return [(grad / self.config.nr_tower, var) for grad, var in grads]
+            with tf.name_scope('async_scale_grad'):
+                return [(grad / self.config.nr_tower, var) for grad, var in grads]
         grad_list = map(scale, grad_list)
         grad_list = [self.process_grads(g) for g in grad_list]
 
         # use grad from the first tower for iteration in main thread
         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grad_list[0], get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), name='train_op')
         describe_model()
 
         # prepare train_op for the rest of the towers
@@ -175,13 +175,23 @@ class QueueInputTrainer(Trainer):
         self.train_op = tf.group(
             self.config.optimizer.apply_gradients(grads, get_global_step_var()),
-            summary_moving_average())
+            summary_moving_average(), 'train_op')
         self.main_loop()
 
     def run_step(self):
         """ just run self.train_op"""
         self.sess.run([self.train_op])
+        #run_metadata = tf.RunMetadata()
+        #self.sess.run([self.train_op],
+                #options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
+                #run_metadata=run_metadata
+                #)
+        #from tensorflow.python.client import timeline
+        #trace = timeline.Timeline(step_stats=run_metadata.step_stats)
+        #trace_file = open('timeline.ctf.json', 'w')
+        #trace_file.write(trace.generate_chrome_trace_format())
+        #import sys; sys.exit()
 
     def _trigger_epoch(self):
         # need to run summary_op every epoch
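Two notes on this last hunk. First, unlike the multi-GPU trainers above, 'train_op' is passed to tf.group positionally here; with the usual tf.group(*inputs, name=...) signature a bare string is treated as an input rather than as the op's name, so the keyword form name='train_op' is presumably what is intended. Second, the commented-out block is the standard graph-mode recipe for dumping a Chrome-trace timeline of a single training step; a cleaned-up sketch of that recipe (sess and train_op are assumed to exist, the output path is illustrative):

    import tensorflow as tf
    from tensorflow.python.client import timeline

    run_metadata = tf.RunMetadata()
    sess.run([train_op],
             options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
             run_metadata=run_metadata)
    # convert the step stats to a Chrome trace; open the file in chrome://tracing
    trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    with open('timeline.ctf.json', 'w') as f:
        f.write(trace.generate_chrome_trace_format())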