Shashank Suhas / seminar-breakout · Commits
Commit f409fbf0, authored Oct 19, 2017 by Yuxin Wu
Move gradient average utilities

Parent: d53b5c4c
Showing 4 changed files with 80 additions and 51 deletions.
tensorpack/graph_builder/__init__.py (+1, -1)
tensorpack/graph_builder/distributed.py (+2, -4)
tensorpack/graph_builder/training.py (+8, -45)
tensorpack/graph_builder/utils.py (+69, -1)
tensorpack/graph_builder/__init__.py

@@ -20,7 +20,7 @@ def global_import(name):
 _CURR_DIR = os.path.dirname(__file__)
-_SKIP = []
+_SKIP = ['utils']
 for _, module_name, _ in iter_modules([_CURR_DIR]):
     srcpath = os.path.join(_CURR_DIR, module_name + '.py')
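Adding 'utils' to _SKIP presumably keeps the global_import loop from re-exporting the submodule's names at package level, so the new utilities are imported explicitly where needed, as training.py does below. A hedged sketch of what a caller would write:

    from tensorpack.graph_builder.utils import allreduce_grads, average_grads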
tensorpack/graph_builder/distributed.py

@@ -100,11 +100,9 @@ class DistributedReplicatedBuilder(DataParallelBuilder):
         new_tower_grads = []
         with tf.name_scope('AvgGrad'):
             for i, grad_and_vars in enumerate(zip(*tower_grads)):
-                # Ngpu * 2
-                v = grad_and_vars[0][1]
-                # Ngpu * 2
-                all_grads = [g for (g, _) in grad_and_vars]
                 with tf.device(devices[i % nr_device]):
+                    v = grad_and_vars[0][1]
                     # average gradient
+                    all_grads = [g for (g, _) in grad_and_vars]
                     grad = tf.multiply(tf.add_n(all_grads), 1.0 / nr_device)
                     new_tower_grads.append((grad, v))
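For intuition, here is a plain-Python sketch of the loop above, with made-up gradients and hypothetical device names: each variable's gradients are summed across towers and scaled by 1/nr_device (which here equals the number of towers) on a device chosen round-robin, spreading the reduction work instead of placing it all on one GPU.

    devices = ['/gpu:0', '/gpu:1']     # hypothetical device names
    nr_device = len(devices)
    tower_grads = [
        [(1.0, 'w'), (10.0, 'b')],     # (grad, var) pairs from tower 0
        [(3.0, 'w'), (30.0, 'b')],     # (grad, var) pairs from tower 1
    ]
    # zip(*tower_grads) regroups per variable: N groups of K (grad, var) pairs.
    for i, grad_and_vars in enumerate(zip(*tower_grads)):
        v = grad_and_vars[0][1]
        avg = sum(g for g, _ in grad_and_vars) * (1.0 / nr_device)
        print(v, avg, '->', devices[i % nr_device])
    # w 2.0 -> /gpu:0
    # b 20.0 -> /gpu:1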
tensorpack/graph_builder/training.py

@@ -14,7 +14,9 @@ from ..tfutils.collection import backup_collection, restore_collection
 from ..tfutils.gradproc import ScaleGradient
 from ..utils.naming import TOWER_FREEZE_KEYS
-from .utils import LeastLoadedDeviceSetter, override_to_local_variable
+from .utils import (
+    LeastLoadedDeviceSetter, override_to_local_variable,
+    allreduce_grads, average_grads)
 
 __all__ = ['GraphBuilder',

@@ -123,25 +125,6 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
         assert ps_device in ['cpu', 'gpu']
         self.ps_device = ps_device
 
-    @staticmethod
-    def _average_grads(tower_grads):
-        # tower_grads: Ngpu x Nvar x 2
-        nr_tower = len(tower_grads)
-        if nr_tower == 1:
-            return tower_grads[0]
-        new_tower_grads = []
-        with tf.name_scope('AvgGrad'):
-            for grad_and_vars in zip(*tower_grads):
-                # Ngpu * 2
-                v = grad_and_vars[0][1]
-                all_grads = [g for (g, _) in grad_and_vars]
-                with tf.device(v.device):
-                    # colocate summed grad with var
-                    grad = tf.multiply(tf.add_n(all_grads), 1.0 / nr_tower)
-                new_tower_grads.append((grad, v))
-        return new_tower_grads
-
     def build(self, get_grad_fn, get_opt_fn):
         """
         Args:

@@ -166,7 +149,7 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
         # self.train_op = tf.group(*ops)
         # return
 
-        grads = SyncMultiGPUParameterServerBuilder._average_grads(grad_list)
+        grads = average_grads(grad_list)
         # grads = grad_list[0]
 
         opt = get_opt_fn()

@@ -187,27 +170,6 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
     See https://www.tensorflow.org/performance/benchmarks for details.
     """
 
-    @staticmethod
-    def _allreduce_grads(tower_grads):
-        from tensorflow.contrib import nccl
-        nr_tower = len(tower_grads)
-        if nr_tower == 1:
-            return [[x] for x in tower_grads[0]]
-        new_tower_grads = []
-        with tf.name_scope('AvgGrad'):
-            for grad_and_vars in zip(*tower_grads):
-                v = grad_and_vars[0][1]
-                grads = [g for g, _ in grad_and_vars]
-                summed = nccl.all_sum(grads)
-
-                grads_for_a_var = []
-                for (_, v), g in zip(grad_and_vars, summed):
-                    with tf.device(g.device):
-                        g = tf.multiply(g, 1.0 / nr_tower)
-                        grads_for_a_var.append((g, v))
-                new_tower_grads.append(grads_for_a_var)
-        # NVar * NGPU * 2
-        return new_tower_grads
-
     def build(self, get_grad_fn, get_opt_fn):
         """

@@ -231,13 +193,14 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
             get_grad_fn,
             # use no variable scope for the first tower
             use_vs=[False] + [True] * (len(self.towers) - 1))
-        grads = SyncMultiGPUReplicatedBuilder._allreduce_grads(grad_list)
+        DataParallelBuilder._check_grad_list(grad_list)
+        grads = allreduce_grads(grad_list)
 
         train_ops = []
         opt = get_opt_fn()
-        for idx in range(len(self.towers)):
+        for idx, grad_and_vars in enumerate(grads):
             with tf.device(raw_devices[idx]):
-                grad_and_vars = [x[idx] for x in grads]
                 # apply_gradients may create variables. Make them LOCAL_VARIABLES
                 with override_to_local_variable(enable=idx > 0):
                     train_ops.append(opt.apply_gradients(
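After this change the two builders differ mainly in which utility they call. A shape-only sketch in plain Python (values invented) of the contract the docstrings in utils.py describe, with K = 2 towers and N = 1 variable:

    grad_list = [[(1.0, 'w')], [(3.0, 'w')]]   # K x N x 2

    # average_grads: K x N x 2 -> N x 2; one averaged pair per variable,
    # used by SyncMultiGPUParameterServerBuilder.
    avg = [(sum(g for g, _ in gv) / len(grad_list), gv[0][1])
           for gv in zip(*grad_list)]
    assert avg == [(2.0, 'w')]

    # allreduce_grads: K x N x 2 -> K x N x 2; every tower keeps its own
    # averaged copy, so the replicated builder's loop can call
    # opt.apply_gradients once per tower on raw_devices[idx].
    allreduced = [avg for _ in grad_list]
    assert allreduced == [[(2.0, 'w')], [(2.0, 'w')]]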
tensorpack/graph_builder/utils.py

@@ -9,7 +9,12 @@ import tensorflow as tf
 __all__ = ['LeastLoadedDeviceSetter',
            'OverrideToLocalVariable',
-           'override_to_local_variable']
+           'override_to_local_variable',
+           'allreduce_grads',
+           'average_grads']
 
 """
 Some utilities for building the graph.
 """
 
 @contextmanager

@@ -73,3 +78,66 @@ class LeastLoadedDeviceSetter(object):
     def __str__(self):
         return "LeastLoadedDeviceSetter-{}".format(self.worker_device)
+
+
+def allreduce_grads(all_grads):
+    """
+    All-reduce average the gradients among devices. Results are broadcasted to all devices.
+
+    Args:
+        all_grads (K x N x 2): A list of K lists. Each of the list is a list of N (grad, var) tuples.
+            The variables have to be the same across the K lists.
+    Returns:
+        (K x N x 2): same as input, but each grad is replaced by the average over K lists.
+    """
+    from tensorflow.contrib import nccl
+    nr_tower = len(all_grads)
+    if nr_tower == 1:
+        return [[x] for x in all_grads[0]]
+    new_all_grads = []  # NVar * NGPU * 2
+    with tf.name_scope('AvgGrad'):
+        for grad_and_vars in zip(*all_grads):
+            v = grad_and_vars[0][1]
+            grads = [g for g, _ in grad_and_vars]
+            summed = nccl.all_sum(grads)
+
+            grads_for_a_var = []
+            for (_, v), g in zip(grad_and_vars, summed):
+                with tf.device(g.device):
+                    g = tf.multiply(g, 1.0 / nr_tower)
+                    grads_for_a_var.append((g, v))
+            new_all_grads.append(grads_for_a_var)
+
+    # transpose
+    ret = [k for k in zip(*new_all_grads)]
+    return ret
+
+
+def average_grads(all_grads):
+    """
+    Average the gradients, on the device of each variable.
+
+    Args:
+        all_grads (K x N x 2): A list of K lists. Each of the list is a list of N (grad, var) tuples.
+            The variables have to be the same across the K lists.
+    Returns:
+        (N x 2): A list of N (grad, var) tuples, where grad is averaged over K.
+    """
+    nr_tower = len(all_grads)
+    if nr_tower == 1:
+        return all_grads[0]
+    ret = []
+    with tf.name_scope('AvgGrad'):
+        for grad_and_vars in zip(*all_grads):
+            # Ngpu * 2
+            v = grad_and_vars[0][1]
+            all_grads = [g for (g, _) in grad_and_vars]
+            with tf.device(v.device):
+                # colocate summed grad with var
+                grad = tf.multiply(tf.add_n(all_grads), 1.0 / nr_tower)
+            ret.append((grad, v))
+    return ret
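A minimal usage sketch of the new average_grads, assuming a TF 1.x graph-mode session (tensor values are made up; allreduce_grads is analogous but needs multiple GPUs for nccl.all_sum):

    import tensorflow as tf
    from tensorpack.graph_builder.utils import average_grads

    w = tf.Variable(0.0, name='w')
    # Pretend two towers produced these gradients for the same variable.
    grad_list = [[(tf.constant(1.0), w)], [(tf.constant(3.0), w)]]
    (avg, var), = average_grads(grad_list)
    with tf.Session() as sess:
        print(sess.run(avg))   # 2.0 == (1.0 + 3.0) / 2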