Commit 1d99dc4e authored Jan 07, 2018 by Yuxin Wu
organize name scopes in trainers
parent 5f750f13
Showing 2 changed files with 29 additions and 50 deletions (+29 -50)
tensorpack/graph_builder/distributed.py  (+20 -43)
tensorpack/graph_builder/training.py  (+9 -7)
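Note (not part of the original commit page): tensorpack builds TF 1.x graphs here, and tf.name_scope only changes how newly created ops are named, so the refactor below mainly affects how the training graph is grouped (e.g. in TensorBoard). A minimal sketch of that mechanism, using generic op names rather than anything from this repository:

# Ops created inside a tf.name_scope get the scope name as a prefix,
# so graph viewers can collapse them into a single node.
import tensorflow as tf

a = tf.constant(1.0, name='a')
b = tf.constant(2.0, name='b')
with tf.name_scope('apply_gradients'):
    c = tf.add(a, b, name='sum')

print(c.op.name)  # -> 'apply_gradients/sum'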
tensorpack/graph_builder/distributed.py

@@ -4,7 +4,7 @@
 import tensorflow as tf
 import re
-from six.moves import zip, range
+from six.moves import range
 from ..utils.argtools import memoized
 from ..tfutils.common import get_op_tensor_name, get_global_step_var
@@ -194,32 +194,6 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
         # Device for queues for managing synchronization between servers
         self.sync_queue_devices = ['/job:ps/task:%s/cpu:0' % i for i in range(self.num_ps)]

-    @staticmethod
-    def _average_grads(tower_grads, devices):
-        """
-        Average grads from towers.
-        The device where the average happens is chosen with round-robin.
-
-        Args:
-            tower_grads: Ngpu x Nvar x 2
-
-        Returns:
-            Nvar x 2
-        """
-        nr_device = len(devices)
-        if nr_device == 1:
-            return tower_grads[0]
-        new_tower_grads = []
-        with tf.name_scope('AvgGrad'):
-            for i, grad_and_vars in enumerate(zip(*tower_grads)):
-                v = grad_and_vars[0][1]
-                # Ngpu * 2
-                all_grads = [g for (g, _) in grad_and_vars]
-                with tf.device(devices[i % nr_device]):
-                    grad = tf.multiply(
-                        tf.add_n(all_grads), 1.0 / nr_device)
-                new_tower_grads.append((grad, v))
-        return new_tower_grads
     @staticmethod
     def _apply_shadow_vars(avg_grads):
         """
@@ -298,7 +272,7 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
             use_vs=[True] * len(self.towers))  # open vs at each tower
         DataParallelBuilder._check_grad_list(grad_list)

-        avg_grads = DistributedReplicatedBuilder._average_grads(grad_list, self.raw_devices)
+        avg_grads = average_grads(grad_list, devices=self.raw_devices)
         with tf.device(self.param_server_device):
             ps_var_grads = DistributedReplicatedBuilder._apply_shadow_vars(avg_grads)
             var_update_ops = self._apply_gradients_and_copy(
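The shared average_grads utility that replaces the removed _average_grads staticmethod is not shown in this diff; only its call site above is. Assuming it keeps the removed method's round-robin behaviour, a minimal sketch could look like the following (the signature is taken from the new call site, the body from the deleted code above):

# Hypothetical standalone version of the gradient-averaging helper.
import tensorflow as tf
from six.moves import zip

def average_grads(tower_grads, devices):
    """Average (grad, var) pairs across towers.

    tower_grads is Ngpu x Nvar x 2; the averaging op for the i-th variable
    is placed on devices[i % len(devices)] (round-robin).
    """
    nr_device = len(devices)
    if nr_device == 1:
        return tower_grads[0]
    new_tower_grads = []
    with tf.name_scope('AvgGrad'):
        # zip(*tower_grads) transposes to Nvar tuples of per-tower (grad, var)
        for i, grad_and_vars in enumerate(zip(*tower_grads)):
            v = grad_and_vars[0][1]
            all_grads = [g for (g, _) in grad_and_vars]
            with tf.device(devices[i % nr_device]):
                grad = tf.multiply(tf.add_n(all_grads), 1.0 / nr_device)
            new_tower_grads.append((grad, v))
    return new_tower_grads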
@@ -312,8 +286,10 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
             'post_copy_barrier', [main_fetch])

         # initial local_vars syncing
-        initial_sync_op = self._get_initial_sync_op()
+        with tf.name_scope('initial_sync_variables'):
+            initial_sync_op = self._get_initial_sync_op()
         if len(self._shadow_model_vars) and self.is_chief:
-            model_sync_op = self._get_sync_model_vars_op()
+            with tf.name_scope('sync_model_variables'):
+                model_sync_op = self._get_sync_model_vars_op()
         else:
             model_sync_op = None
@@ -332,6 +308,7 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
             list of copy ops
         """
         # TODO do this for variables together?
+        with tf.name_scope('apply_gradients'):
             var_update_ops = []
             for vid, (g, v) in enumerate(ps_var_grads):
                 # TODO do we put momentum variables into local or global?
tensorpack/graph_builder/training.py

@@ -218,6 +218,7 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
         train_ops = []
         opt = get_opt_fn()
+        with tf.name_scope('apply_gradients'):
             for idx, grad_and_vars in enumerate(grads):
                 with tf.device(raw_devices[idx]):
                     # apply_gradients may create variables. Make them LOCAL_VARIABLES
@@ -226,6 +227,7 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
                         grad_and_vars, name='apply_grad_{}'.format(idx)))
         train_op = tf.group(*train_ops, name='train_op')

+        with tf.name_scope('sync_variables'):
             post_init_op = SyncMultiGPUReplicatedBuilder.get_post_init_ops()
         return train_op, post_init_op
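Not part of the commit, but a quick way to see its effect after a trainer has built its graph: ops created by these builders should now carry the new scope prefixes. A small sketch, assuming the default graph is the one the builder populated:

import tensorflow as tf

graph = tf.get_default_graph()
for prefix in ('apply_gradients', 'sync_variables', 'initial_sync_variables'):
    ops = [op.name for op in graph.get_operations()
           if op.name.startswith(prefix + '/')]
    print('%s: %d ops' % (prefix, len(ops)))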