Commit f2d2501b authored by Yuxin Wu's avatar Yuxin Wu

Bring in experimental performance features.

parent d3e0a688
......@@ -23,7 +23,8 @@ class DistributedReplicatedBuilder(DataParallelBuilder):
and get synchronously applied to the global copy of variables located on PS.
Then each worker copies the latest variables from PS back to local.
See https://www.tensorflow.org/performance/benchmarks for details.
It is an equivalent of `--variable_update=distributed_replicated` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`.
Note:
Gradients are not averaged across workers, but applied to PS variables
......
......@@ -175,10 +175,10 @@ class ModelDesc(ModelDescBase):
self.build_graph(*inputs)
return self.get_cost()
# TODO this is deprecated and only used for v1 trainers
def _build_graph_get_grads(self, *inputs):
"""
Build the graph from inputs and return the grads.
This is useful for most of the :class:`GraphBuilder` which expects such a function.
Returns:
[(grad, var)]
......
......@@ -106,7 +106,8 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
shared variable scope. It synchronizes the gradients computed
from each tower, averages them and applies to the shared variables.
See https://www.tensorflow.org/performance/benchmarks for details.
It is an equivalent of `--variable_update=parameter_server` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`.
"""
def __init__(self, towers, ps_device=None):
"""
......@@ -164,7 +165,8 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
It will build one tower on each GPU under its own variable scope.
Each gradient update is averaged across all GPUs through NCCL.
See https://www.tensorflow.org/performance/benchmarks for details.
It is an equivalent of `--variable_update=replicated` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`.
"""
def build(self, get_grad_fn, get_opt_fn):
......
......@@ -22,9 +22,16 @@ except (ImportError, TypeError):
pass
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # issue#9339
os.environ['TF_AUTOTUNE_THRESHOLD'] = '3' # use more warm-up
os.environ['TF_AUTOTUNE_THRESHOLD'] = '2' # use more warm-up
# Since 1.3, this is not needed
os.environ['TF_AVGPOOL_USE_CUDNN'] = '1' # issue#8566
# TF1.5 features from tensorflow/benchmarks
os.environ['TF_SYNC_ON_FINISH'] = '0' # will become default
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '2'
try:
import tensorflow as tf # noqa
_version = tf.__version__.split('.')
......
......@@ -32,21 +32,19 @@ def get_default_sess_config(mem_fraction=0.99):
conf.allow_soft_placement = True
# conf.log_device_placement = True
# https://github.com/tensorflow/tensorflow/issues/9322#issuecomment-295758107
# can speed up a bit
conf.intra_op_parallelism_threads = 1
conf.inter_op_parallelism_threads = 0
# TF benchmarks use cpu_count() - gpu_thread_count(), e.g. 80 - 8 * 2
# Didn't see much difference.
conf.gpu_options.per_process_gpu_memory_fraction = mem_fraction
if get_tf_version_number() >= 1.2:
conf.gpu_options.force_gpu_compatible = True
conf.gpu_options.allocator_type = 'BFC'
conf.gpu_options.allow_growth = True
# May hurt performance
# conf.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
# TODO test this
# conf.graph_options.place_pruned_graph = True
return conf
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment