Commit f2d2501b authored by Yuxin Wu's avatar Yuxin Wu

Bring in experimental performance features.

parent d3e0a688
......@@ -23,7 +23,8 @@ class DistributedReplicatedBuilder(DataParallelBuilder):
and get synchronously applied to the global copy of variables located on PS.
Then each worker copies the latest variables from PS back to local.
See https://www.tensorflow.org/performance/benchmarks for details.
It is an equivalent of `--variable_update=distributed_replicated` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`.
Note:
Gradients are not averaged across workers, but applied to PS variables
......
......@@ -175,10 +175,10 @@ class ModelDesc(ModelDescBase):
self.build_graph(*inputs)
return self.get_cost()
# TODO this is deprecated and only used for v1 trainers
def _build_graph_get_grads(self, *inputs):
"""
Build the graph from inputs and return the grads.
This is useful for most of the :class:`GraphBuilder` which expects such a function.
Returns:
[(grad, var)]
......
......@@ -106,7 +106,8 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
shared variable scope. It synchronizes the gradients computed
from each tower, averages them and applies to the shared variables.
See https://www.tensorflow.org/performance/benchmarks for details.
It is an equivalent of `--variable_update=parameter_server` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`.
"""
def __init__(self, towers, ps_device=None):
"""
......@@ -164,7 +165,8 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
It will build one tower on each GPU under its own variable scope.
Each gradient update is averaged across all GPUs through NCCL.
See https://www.tensorflow.org/performance/benchmarks for details.
It is an equivalent of `--variable_update=replicated` in
`tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`.
"""
def build(self, get_grad_fn, get_opt_fn):
......
......@@ -22,9 +22,16 @@ except (ImportError, TypeError):
pass
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # issue#9339
os.environ['TF_AUTOTUNE_THRESHOLD'] = '3' # use more warm-up
os.environ['TF_AUTOTUNE_THRESHOLD'] = '2' # use more warm-up
# Since 1.3, this is not needed
os.environ['TF_AVGPOOL_USE_CUDNN'] = '1' # issue#8566
# TF1.5 features from tensorflow/benchmarks
os.environ['TF_SYNC_ON_FINISH'] = '0' # will become default
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '2'
try:
import tensorflow as tf # noqa
_version = tf.__version__.split('.')
......
......@@ -32,21 +32,19 @@ def get_default_sess_config(mem_fraction=0.99):
conf.allow_soft_placement = True
# conf.log_device_placement = True
# https://github.com/tensorflow/tensorflow/issues/9322#issuecomment-295758107
# can speed up a bit
conf.intra_op_parallelism_threads = 1
conf.inter_op_parallelism_threads = 0
# TF benchmarks use cpu_count() - gpu_thread_count(), e.g. 80 - 8 * 2
# Didn't see much difference.
conf.gpu_options.per_process_gpu_memory_fraction = mem_fraction
if get_tf_version_number() >= 1.2:
conf.gpu_options.force_gpu_compatible = True
conf.gpu_options.allocator_type = 'BFC'
conf.gpu_options.allow_growth = True
# May hurt performance
# conf.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
# TODO test this
# conf.graph_options.place_pruned_graph = True
return conf
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment