Commit f2d2501b authored by Yuxin Wu

Bring in experimental performance features.

parent d3e0a688
...@@ -23,7 +23,8 @@ class DistributedReplicatedBuilder(DataParallelBuilder): ...@@ -23,7 +23,8 @@ class DistributedReplicatedBuilder(DataParallelBuilder):
and get synchronously applied to the global copy of variables located on PS. and get synchronously applied to the global copy of variables located on PS.
Then each worker copies the latest variables from PS back to local. Then each worker copies the latest variables from PS back to local.
See https://www.tensorflow.org/performance/benchmarks for details. It is an equivalent of `--variable_update=distributed_replicated` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
Note: Note:
Gradients are not averaged across workers, but applied to PS variables Gradients are not averaged across workers, but applied to PS variables
......
...@@ -175,10 +175,10 @@ class ModelDesc(ModelDescBase): ...@@ -175,10 +175,10 @@ class ModelDesc(ModelDescBase):
self.build_graph(*inputs) self.build_graph(*inputs)
return self.get_cost() return self.get_cost()
# TODO this is deprecated and only used for v1 trainers
def _build_graph_get_grads(self, *inputs): def _build_graph_get_grads(self, *inputs):
""" """
Build the graph from inputs and return the grads. Build the graph from inputs and return the grads.
This is useful for most of the :class:`GraphBuilder` which expects such a function.
Returns: Returns:
[(grad, var)] [(grad, var)]
......
...@@ -106,7 +106,8 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder): ...@@ -106,7 +106,8 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
shared variable scope. It synchronizes the gradients computed shared variable scope. It synchronizes the gradients computed
from each tower, averages them and applies to the shared variables. from each tower, averages them and applies to the shared variables.
See https://www.tensorflow.org/performance/benchmarks for details. It is an equivalent of `--variable_update=parameter_server` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
""" """
def __init__(self, towers, ps_device=None): def __init__(self, towers, ps_device=None):
""" """
...@@ -164,7 +165,8 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder): ...@@ -164,7 +165,8 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
It will build one tower on each GPU under its own variable scope. It will build one tower on each GPU under its own variable scope.
Each gradient update is averaged across all GPUs through NCCL. Each gradient update is averaged across all GPUs through NCCL.
See https://www.tensorflow.org/performance/benchmarks for details. It is an equivalent of `--variable_update=replicated` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
""" """
def build(self, get_grad_fn, get_opt_fn): def build(self, get_grad_fn, get_opt_fn):
......
...@@ -22,9 +22,16 @@ except (ImportError, TypeError): ...@@ -22,9 +22,16 @@ except (ImportError, TypeError):
pass pass
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # issue#9339 os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # issue#9339
os.environ['TF_AUTOTUNE_THRESHOLD'] = '3' # use more warm-up os.environ['TF_AUTOTUNE_THRESHOLD'] = '2' # use more warm-up
# Since 1.3, this is not needed
os.environ['TF_AVGPOOL_USE_CUDNN'] = '1' # issue#8566 os.environ['TF_AVGPOOL_USE_CUDNN'] = '1' # issue#8566
# TF1.5 features from tensorflow/benchmarks
os.environ['TF_SYNC_ON_FINISH'] = '0' # will become default
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '2'
try: try:
import tensorflow as tf # noqa import tensorflow as tf # noqa
_version = tf.__version__.split('.') _version = tf.__version__.split('.')
......
...@@ -32,21 +32,19 @@ def get_default_sess_config(mem_fraction=0.99): ...@@ -32,21 +32,19 @@ def get_default_sess_config(mem_fraction=0.99):
conf.allow_soft_placement = True conf.allow_soft_placement = True
# conf.log_device_placement = True # conf.log_device_placement = True
# https://github.com/tensorflow/tensorflow/issues/9322#issuecomment-295758107
# can speed up a bit
conf.intra_op_parallelism_threads = 1 conf.intra_op_parallelism_threads = 1
conf.inter_op_parallelism_threads = 0 conf.inter_op_parallelism_threads = 0
# TF benchmark use cpu_count() - gpu_thread_count(), e.g. 80 - 8 * 2
# Didn't see much difference.
conf.gpu_options.per_process_gpu_memory_fraction = mem_fraction conf.gpu_options.per_process_gpu_memory_fraction = mem_fraction
if get_tf_version_number() >= 1.2: if get_tf_version_number() >= 1.2:
conf.gpu_options.force_gpu_compatible = True conf.gpu_options.force_gpu_compatible = True
conf.gpu_options.allocator_type = 'BFC'
conf.gpu_options.allow_growth = True conf.gpu_options.allow_growth = True
# May hurt performance # May hurt performance
# conf.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 # conf.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
# TODO test this
# conf.graph_options.place_pruned_graph = True # conf.graph_options.place_pruned_graph = True
return conf return conf
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment