Speed up TFEventWriter initialization

9b3b5413 · Yuxin Wu · dd138d5a · 9b3b5413 · 9b3b5413
Commit 9b3b5413 authored Jul 06, 2019 by Yuxin Wu
Show whitespace changes
Inline Side-by-side

Showing with 18 additions and 10 deletions

tensorpack/callbacks/group.py tensorpack/callbacks/group.py +3 -1

tensorpack/callbacks/monitor.py tensorpack/callbacks/monitor.py +15 -9

No files found.
--- a/tensorpack/callbacks/group.py
+++ b/tensorpack/callbacks/group.py
@@ -51,7 +51,9 @@ class CallbackTimeLogger(object):
 class Callbacks(Callback):
    """
    A container to hold all callbacks, and trigger them iteratively.
-    Note that it does nothing to before_run/after_run.
+
+    This is only used by the base trainer to run all the callbacks.
+    Users do not need to use this class.
    """

    def __init__(self, cbs):

--- a/tensorpack/callbacks/monitor.py
+++ b/tensorpack/callbacks/monitor.py
@@ -12,6 +12,7 @@ import time
 from collections import defaultdict
 from datetime import datetime
 import six
+import threading

 from ..compat import tfv1 as tf
 from ..libinfo import __git_version__
@@ -23,7 +24,7 @@ from .base import Callback
 __all__ = ['MonitorBase', 'Monitors',
           'TFEventWriter', 'JSONWriter',
           'ScalarPrinter', 'SendMonitorData',
-           'TrainingMonitor', 'CometMLMonitor']
+           'CometMLMonitor']


 def image_to_nhwc(arr):
@@ -53,7 +54,9 @@ class MonitorBase(Callback):
    _chief_only = False

    def setup_graph(self, trainer):
+        # Set the same attributes that Callback.setup_graph sets
        self.trainer = trainer
+        self.graph = tf.get_default_graph()
        self._setup_graph()

    def _setup_graph(self):
@@ -97,12 +100,6 @@ class MonitorBase(Callback):
    # TODO process other types


-TrainingMonitor = MonitorBase
-"""
-Old name
-"""
-
-
 class NoOpMonitor(MonitorBase):
    def __init__(self, name=None):
        self._name = name
@@ -259,8 +256,17 @@ class TFEventWriter(MonitorBase):

    def _setup_graph(self):
        self._writer = tf.summary.FileWriter(
-            self._logdir, graph=tf.get_default_graph(),
-            max_queue=self._max_queue, flush_secs=self._flush_secs)
+            self._logdir, max_queue=self._max_queue, flush_secs=self._flush_secs)
+
+    def _write_graph(self):
+        self._writer.add_graph(self.graph)
+
+    def _before_train(self):
+        # Writing a large graph is expensive (~2min), so do it in a separate daemon
+        # thread: it runs in the background while TF warms up in the first iterations.
+        self._write_graph_thread = threading.Thread(target=self._write_graph)
+        self._write_graph_thread.daemon = True  # the `daemon=` ctor kwarg is Python 3 only
+        self._write_graph_thread.start()

    @HIDE_DOC
    def process_summary(self, summary):