Commit fa34d239 authored by Yuxin Wu

Make all monitors chief_only=False by default, improve warnings (#791)

parent b96cb78c
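
For context: chief_only decides whether a callback runs only on the chief worker in distributed training. A minimal, hypothetical sketch of the semantics this commit changes (simplified; not tensorpack's actual trainer code):

    class TrainingMonitor(object):
        _chief_only = False   # new default from this commit: monitors run on every worker

    class ChiefOnlyCallback(object):
        _chief_only = True    # e.g. a checkpoint saver that only the chief needs

    def filter_callbacks(callbacks, is_chief):
        # Non-chief workers drop chief-only callbacks and keep the rest,
        # so monitors like ScalarPrinter keep working in every process.
        return [cb for cb in callbacks if is_chief or not cb._chief_only]

    kept = filter_callbacks([TrainingMonitor(), ChiefOnlyCallback()], is_chief=False)
    print([type(cb).__name__ for cb in kept])   # ['TrainingMonitor']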
@@ -46,6 +46,9 @@ class TrainingMonitor(Callback):
     .. document private functions
     .. automethod:: _setup_graph
     """
+
+    _chief_only = False
+
     def setup_graph(self, trainer):
         self.trainer = trainer
         self._setup_graph()
@@ -92,7 +95,13 @@ class TrainingMonitor(Callback):
 class NoOpMonitor(TrainingMonitor):
-    pass
+    def __init__(self, name=None):
+        self._name = name
+
+    def __str__(self):
+        if self._name is None:
+            return "NoOpMonitor"
+        return "NoOpMonitor({})".format(self._name)
 
 class Monitors(Callback):
@@ -221,7 +230,7 @@ class TFEventWriter(TrainingMonitor):
         self._flush_secs = flush_secs
         self._split_files = split_files
 
-    def __new__(cls, logdir=None, max_queue=10, flush_secs=120):
+    def __new__(cls, logdir=None, max_queue=10, flush_secs=120, **kwargs):
         if logdir is None:
             logdir = logger.get_logger_dir()
@@ -229,7 +238,7 @@ class TFEventWriter(TrainingMonitor):
             return super(TFEventWriter, cls).__new__(cls)
         else:
             logger.warn("logger directory was not set. Ignore TFEventWriter.")
-            return NoOpMonitor()
+            return NoOpMonitor("TFEventWriter")
 
     def _setup_graph(self):
         self._writer = tf.summary.FileWriter(
@@ -268,7 +277,7 @@ class JSONWriter(TrainingMonitor):
             return super(JSONWriter, cls).__new__(cls)
         else:
             logger.warn("logger directory was not set. Ignore JSONWriter.")
-            return NoOpMonitor()
+            return NoOpMonitor("JSONWriter")
 
     @staticmethod
     def load_existing_json():
@@ -370,8 +379,6 @@ class ScalarPrinter(TrainingMonitor):
     Print scalar data into terminal.
     """
 
-    _chief_only = False
-
     def __init__(self, enable_step=False, enable_epoch=True,
                  whitelist=None, blacklist=None):
         """
@@ -439,8 +446,6 @@ class ScalarHistory(TrainingMonitor):
     Only used by monitors internally.
     """
 
-    _chief_only = False
-
     def _setup_graph(self):
         self._dic = defaultdict(list)
...
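
The NoOpMonitor change above exists so the fallback names the monitor it replaces. A self-contained sketch of the pattern (hypothetical class names, mirroring but not reproducing tensorpack's code): when the precondition fails, __new__ returns a named no-op object, and Python then skips __init__ because the result is not an instance of the requested class.

    class NoOpMonitor(object):
        def __init__(self, name=None):
            self._name = name

        def __str__(self):
            if self._name is None:
                return "NoOpMonitor"
            return "NoOpMonitor({})".format(self._name)

    class EventWriter(object):
        def __new__(cls, logdir=None, **kwargs):
            # **kwargs lets extra constructor arguments (like
            # TFEventWriter's split_files) pass through __new__.
            if logdir is not None:
                return super(EventWriter, cls).__new__(cls)
            print("logger directory was not set. Ignore EventWriter.")
            return NoOpMonitor("EventWriter")  # names the monitor that was dropped

        def __init__(self, logdir=None, **kwargs):
            self.logdir = logdir

    print(EventWriter())   # prints: NoOpMonitor(EventWriter)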
@@ -114,7 +114,7 @@ class GPUUtilizationTracker(Callback):
 class GraphProfiler(Callback):
     """
     Enable profiling by installing session hooks,
-    and write metadata or tracing files to ``logger.get_logger_dir()``.
+    and write tracing files / events / metadata to ``logger.get_logger_dir()``.
 
     The tracing files can be loaded from ``chrome://tracing``.
     The metadata files can be processed by
@@ -122,9 +122,16 @@ class GraphProfiler(Callback):
     <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/README.md>`_.
     The event is viewable from tensorboard.
 
-    Note that the profiling is enabled for every step.
-    You probably want to schedule it less frequently by
-    :class:`PeriodicRunHooks`.
+    Tips:
+
+    Note that the profiling is by default enabled for every step and is expensive.
+    You probably want to schedule it less frequently, e.g.:
+
+    .. code-block:: none
+
+        EnableCallbackIf(
+            GraphProfiler(dump_tracing=True, dump_event=True),
+            lambda self: self.trainer.global_step > 20 and self.trainer.global_step < 30)
     """
 
     def __init__(self, dump_metadata=False, dump_tracing=True, dump_event=False):
         """
@@ -138,7 +145,7 @@ class GraphProfiler(Callback):
         self._dump_meta = bool(dump_metadata)
         self._dump_tracing = bool(dump_tracing)
         self._dump_event = bool(dump_event)
-        assert os.path.isdir(self._dir)
+        assert os.path.isdir(self._dir), self._dir
 
     def _before_run(self, _):
         opt = tf.RunOptions()
...
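
A usage sketch for the scheduling tip added to the docstring above; the imports assume tensorpack's public callback API and should be treated as an assumption:

    from tensorpack.callbacks import EnableCallbackIf, GraphProfiler

    callbacks = [
        # Profile only a narrow window of steps; per-step profiling is expensive.
        EnableCallbackIf(
            GraphProfiler(dump_tracing=True, dump_event=True),
            lambda self: 20 < self.trainer.global_step < 30),
    ]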
@@ -74,6 +74,7 @@ def send_dataflow_zmq(df, addr, hwm=50, format=None, bind=False):
                     avg = "{:.3f}".format(sum(q) / len(q))
                     pbar.set_postfix({'AvgSendLat': avg})
     finally:
+        logger.info("Exiting send_dataflow_zmq ...")
         socket.setsockopt(zmq.LINGER, 0)
         socket.close()
         if not ctx.closed:
...
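
The new log line lands in the cleanup path shown above. A minimal, self-contained sketch of that shutdown pattern in pyzmq (address and payload are placeholders):

    import zmq

    ctx = zmq.Context()
    socket = ctx.socket(zmq.PUSH)
    socket.set_hwm(50)
    socket.connect("tcp://127.0.0.1:8877")
    try:
        socket.send(b"datapoint")         # stand-in for serialized dataflow output
    finally:
        print("Exiting send_dataflow_zmq ...")
        socket.setsockopt(zmq.LINGER, 0)  # drop unsent messages instead of blocking
        socket.close()
        if not ctx.closed:
            ctx.destroy(0)                # tear down the context immediately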
@@ -343,7 +343,7 @@ class HorovodTrainer(SingleCostTrainer):
         self.is_chief = hvd.rank() == 0
         self._local_rank = hvd.local_rank()
         self._average = average
-        logger.info("Horovod local rank={}".format(self._local_rank))
+        logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
         super(HorovodTrainer, self).__init__()
 
     def allreduce(self, grads):
...
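
For reference, a small sketch of the rank bookkeeping in the hunk above (assumes Horovod's TensorFlow binding; print stands in for tensorpack's logger):

    import horovod.tensorflow as hvd

    hvd.init()
    is_chief = hvd.rank() == 0      # global rank 0 acts as the chief worker
    local_rank = hvd.local_rank()   # rank within this machine, often used to pick a GPU
    print("[HorovodTrainer] local rank={}".format(local_rank))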