Commit fa34d239 authored by Yuxin Wu

Make all monitors chief_only=False by default, improve warnings (#791)

parent b96cb78c
......@@ -46,6 +46,9 @@ class TrainingMonitor(Callback):
.. document private functions
.. automethod:: _setup_graph
"""
_chief_only = False
def setup_graph(self, trainer):
self.trainer = trainer
self._setup_graph()
......@@ -92,7 +95,13 @@ class TrainingMonitor(Callback):
class NoOpMonitor(TrainingMonitor):
pass
def __init__(self, name=None):
self._name = name
def __str__(self):
if self._name is None:
return "NoOpMonitor"
return "NoOpMonitor({})".format(self._name)
class Monitors(Callback):
......@@ -221,7 +230,7 @@ class TFEventWriter(TrainingMonitor):
self._flush_secs = flush_secs
self._split_files = split_files
def __new__(cls, logdir=None, max_queue=10, flush_secs=120):
def __new__(cls, logdir=None, max_queue=10, flush_secs=120, **kwargs):
if logdir is None:
logdir = logger.get_logger_dir()
......@@ -229,7 +238,7 @@ class TFEventWriter(TrainingMonitor):
return super(TFEventWriter, cls).__new__(cls)
else:
logger.warn("logger directory was not set. Ignore TFEventWriter.")
return NoOpMonitor()
return NoOpMonitor("TFEventWriter")
def _setup_graph(self):
self._writer = tf.summary.FileWriter(
......@@ -268,7 +277,7 @@ class JSONWriter(TrainingMonitor):
return super(JSONWriter, cls).__new__(cls)
else:
logger.warn("logger directory was not set. Ignore JSONWriter.")
return NoOpMonitor()
return NoOpMonitor("JSONWriter")
@staticmethod
def load_existing_json():
......@@ -370,8 +379,6 @@ class ScalarPrinter(TrainingMonitor):
Print scalar data into terminal.
"""
_chief_only = False
def __init__(self, enable_step=False, enable_epoch=True,
whitelist=None, blacklist=None):
"""
......@@ -439,8 +446,6 @@ class ScalarHistory(TrainingMonitor):
Only used by monitors internally.
"""
_chief_only = False
def _setup_graph(self):
self._dic = defaultdict(list)
......
......@@ -114,7 +114,7 @@ class GPUUtilizationTracker(Callback):
class GraphProfiler(Callback):
"""
Enable profiling by installing session hooks,
and write metadata or tracing files to ``logger.get_logger_dir()``.
and write tracing files / events / metadata to ``logger.get_logger_dir()``.
The tracing files can be loaded from ``chrome://tracing``.
The metadata files can be processed by
......@@ -122,9 +122,16 @@ class GraphProfiler(Callback):
<https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/README.md>`_.
The event is viewable from tensorboard.
Note that the profiling is enabled for every step.
You probably want to schedule it less frequently by
:class:`PeriodicRunHooks`.
Tips:
Note that the profiling is by default enabled for every step and is expensive.
You probably want to schedule it less frequently, e.g.:
.. code-block:: none
EnableCallbackIf(
GraphProfiler(dump_tracing=True, dump_event=True),
lambda self: self.trainer.global_step > 20 and self.trainer.global_step < 30)
"""
def __init__(self, dump_metadata=False, dump_tracing=True, dump_event=False):
"""
......@@ -138,7 +145,7 @@ class GraphProfiler(Callback):
self._dump_meta = bool(dump_metadata)
self._dump_tracing = bool(dump_tracing)
self._dump_event = bool(dump_event)
assert os.path.isdir(self._dir)
assert os.path.isdir(self._dir), self._dir
def _before_run(self, _):
opt = tf.RunOptions()
......
......@@ -74,6 +74,7 @@ def send_dataflow_zmq(df, addr, hwm=50, format=None, bind=False):
avg = "{:.3f}".format(sum(q) / len(q))
pbar.set_postfix({'AvgSendLat': avg})
finally:
logger.info("Exiting send_dataflow_zmq ...")
socket.setsockopt(zmq.LINGER, 0)
socket.close()
if not ctx.closed:
......
......@@ -343,7 +343,7 @@ class HorovodTrainer(SingleCostTrainer):
self.is_chief = hvd.rank() == 0
self._local_rank = hvd.local_rank()
self._average = average
logger.info("Horovod local rank={}".format(self._local_rank))
logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
super(HorovodTrainer, self).__init__()
def allreduce(self, grads):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment