Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
seminar-breakout
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Shashank Suhas
seminar-breakout
Commits
1d99dc4e
You need to sign in or sign up before continuing.
Commit
1d99dc4e
authored
Jan 07, 2018
by
Yuxin Wu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
organize name scopes in trainers
parent
5f750f13
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
29 additions
and
50 deletions
+29
-50
tensorpack/graph_builder/distributed.py
tensorpack/graph_builder/distributed.py
+20
-43
tensorpack/graph_builder/training.py
tensorpack/graph_builder/training.py
+9
-7
No files found.
tensorpack/graph_builder/distributed.py
View file @
1d99dc4e
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
import
tensorflow
as
tf
import
tensorflow
as
tf
import
re
import
re
from
six.moves
import
zip
,
range
from
six.moves
import
range
from
..utils.argtools
import
memoized
from
..utils.argtools
import
memoized
from
..tfutils.common
import
get_op_tensor_name
,
get_global_step_var
from
..tfutils.common
import
get_op_tensor_name
,
get_global_step_var
...
@@ -194,32 +194,6 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
...
@@ -194,32 +194,6 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
# Device for queues for managing synchronization between servers
# Device for queues for managing synchronization between servers
self
.
sync_queue_devices
=
[
'/job:ps/task:
%
s/cpu:0'
%
i
for
i
in
range
(
self
.
num_ps
)]
self
.
sync_queue_devices
=
[
'/job:ps/task:
%
s/cpu:0'
%
i
for
i
in
range
(
self
.
num_ps
)]
@
staticmethod
def
_average_grads
(
tower_grads
,
devices
):
"""
Average grads from towers.
The device where the average happens is chosen with round-robin.
Args:
tower_grads: Ngpu x Nvar x 2
Returns:
Nvar x 2
"""
nr_device
=
len
(
devices
)
if
nr_device
==
1
:
return
tower_grads
[
0
]
new_tower_grads
=
[]
with
tf
.
name_scope
(
'AvgGrad'
):
for
i
,
grad_and_vars
in
enumerate
(
zip
(
*
tower_grads
)):
v
=
grad_and_vars
[
0
][
1
]
# Ngpu * 2
all_grads
=
[
g
for
(
g
,
_
)
in
grad_and_vars
]
with
tf
.
device
(
devices
[
i
%
nr_device
]):
grad
=
tf
.
multiply
(
tf
.
add_n
(
all_grads
),
1.0
/
nr_device
)
new_tower_grads
.
append
((
grad
,
v
))
return
new_tower_grads
@
staticmethod
@
staticmethod
def
_apply_shadow_vars
(
avg_grads
):
def
_apply_shadow_vars
(
avg_grads
):
"""
"""
...
@@ -298,7 +272,7 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
...
@@ -298,7 +272,7 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
use_vs
=
[
True
]
*
len
(
self
.
towers
))
# open vs at each tower
use_vs
=
[
True
]
*
len
(
self
.
towers
))
# open vs at each tower
DataParallelBuilder
.
_check_grad_list
(
grad_list
)
DataParallelBuilder
.
_check_grad_list
(
grad_list
)
avg_grads
=
DistributedReplicatedBuilder
.
_average_grads
(
grad_list
,
self
.
raw_devices
)
avg_grads
=
average_grads
(
grad_list
,
devices
=
self
.
raw_devices
)
with
tf
.
device
(
self
.
param_server_device
):
with
tf
.
device
(
self
.
param_server_device
):
ps_var_grads
=
DistributedReplicatedBuilder
.
_apply_shadow_vars
(
avg_grads
)
ps_var_grads
=
DistributedReplicatedBuilder
.
_apply_shadow_vars
(
avg_grads
)
var_update_ops
=
self
.
_apply_gradients_and_copy
(
var_update_ops
=
self
.
_apply_gradients_and_copy
(
...
@@ -312,9 +286,11 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
...
@@ -312,9 +286,11 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
'post_copy_barrier'
,
[
main_fetch
])
'post_copy_barrier'
,
[
main_fetch
])
# initial local_vars syncing
# initial local_vars syncing
initial_sync_op
=
self
.
_get_initial_sync_op
()
with
tf
.
name_scope
(
'initial_sync_variables'
):
initial_sync_op
=
self
.
_get_initial_sync_op
()
if
len
(
self
.
_shadow_model_vars
)
and
self
.
is_chief
:
if
len
(
self
.
_shadow_model_vars
)
and
self
.
is_chief
:
model_sync_op
=
self
.
_get_sync_model_vars_op
()
with
tf
.
name_scope
(
'sync_model_variables'
):
model_sync_op
=
self
.
_get_sync_model_vars_op
()
else
:
else
:
model_sync_op
=
None
model_sync_op
=
None
return
train_op
,
initial_sync_op
,
model_sync_op
return
train_op
,
initial_sync_op
,
model_sync_op
...
@@ -332,19 +308,20 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
...
@@ -332,19 +308,20 @@ class DistributedReplicatedBuilder(DataParallelBuilder, DistributedBuilderBase):
list of copy ops
list of copy ops
"""
"""
# TODO do this for variables together?
# TODO do this for variables together?
var_update_ops
=
[]
with
tf
.
name_scope
(
'apply_gradients'
):
for
vid
,
(
g
,
v
)
in
enumerate
(
ps_var_grads
):
var_update_ops
=
[]
# TODO do we put momentum variables into local or global?
for
vid
,
(
g
,
v
)
in
enumerate
(
ps_var_grads
):
apply_gradient_op
=
opt
.
apply_gradients
([(
g
,
v
)])
# TODO do we put momentum variables into local or global?
barrier
=
self
.
_add_sync_queues_and_barrier
(
apply_gradient_op
=
opt
.
apply_gradients
([(
g
,
v
)])
'param_update_barrier_{}'
.
format
(
vid
),
[
apply_gradient_op
])
barrier
=
self
.
_add_sync_queues_and_barrier
(
with
tf
.
control_dependencies
([
barrier
]),
\
'param_update_barrier_{}'
.
format
(
vid
),
[
apply_gradient_op
])
tf
.
device
(
self
.
cpu_device
):
with
tf
.
control_dependencies
([
barrier
]),
\
updated_value
=
v
.
read_value
()
tf
.
device
(
self
.
cpu_device
):
for
towerid
in
range
(
self
.
nr_gpu
):
updated_value
=
v
.
read_value
()
var_update_ops
.
append
(
for
towerid
in
range
(
self
.
nr_gpu
):
raw_grad_list
[
towerid
][
vid
][
1
]
.
assign
(
updated_value
))
var_update_ops
.
append
(
return
var_update_ops
raw_grad_list
[
towerid
][
vid
][
1
]
.
assign
(
updated_value
))
return
var_update_ops
def
_get_initial_sync_op
(
self
):
def
_get_initial_sync_op
(
self
):
"""
"""
...
...
tensorpack/graph_builder/training.py
View file @
1d99dc4e
...
@@ -218,15 +218,17 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
...
@@ -218,15 +218,17 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
train_ops
=
[]
train_ops
=
[]
opt
=
get_opt_fn
()
opt
=
get_opt_fn
()
for
idx
,
grad_and_vars
in
enumerate
(
grads
):
with
tf
.
name_scope
(
'apply_gradients'
):
with
tf
.
device
(
raw_devices
[
idx
]):
for
idx
,
grad_and_vars
in
enumerate
(
grads
):
# apply_gradients may create variables. Make them LOCAL_VARIABLES
with
tf
.
device
(
raw_devices
[
idx
]):
with
override_to_local_variable
(
enable
=
idx
>
0
):
# apply_gradients may create variables. Make them LOCAL_VARIABLES
train_ops
.
append
(
opt
.
apply_gradients
(
with
override_to_local_variable
(
enable
=
idx
>
0
):
grad_and_vars
,
name
=
'apply_grad_{}'
.
format
(
idx
)))
train_ops
.
append
(
opt
.
apply_gradients
(
grad_and_vars
,
name
=
'apply_grad_{}'
.
format
(
idx
)))
train_op
=
tf
.
group
(
*
train_ops
,
name
=
'train_op'
)
train_op
=
tf
.
group
(
*
train_ops
,
name
=
'train_op'
)
post_init_op
=
SyncMultiGPUReplicatedBuilder
.
get_post_init_ops
()
with
tf
.
name_scope
(
'sync_variables'
):
post_init_op
=
SyncMultiGPUReplicatedBuilder
.
get_post_init_ops
()
return
train_op
,
post_init_op
return
train_op
,
post_init_op
# Adopt from https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/variable_mgr.py
# Adopt from https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/variable_mgr.py
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment