Shashank Suhas / seminar-breakout · Commits

Commit b0677681
authored Jun 01, 2017 by Yuxin Wu
parent 9fd5cb9f

    split variable strategies into methods

Showing 1 changed file with 50 additions and 33 deletions:
tensorpack/train/distributed.py (+50 -33)
```diff
--- a/tensorpack/train/distributed.py
+++ b/tensorpack/train/distributed.py
@@ -20,6 +20,7 @@ from ..callbacks.monitor import Monitors
 __all__ = ['DistributedReplicatedTrainer']
 
+# Note that only trainable vars are shadowed
 PS_SHADOW_VAR_PREFIX = 'ps_var'
```
```diff
@@ -83,8 +84,12 @@ class DistributedReplicatedTrainer(SingleCostFeedfreeTrainer):
     def _average_grads(tower_grads, devices):
         """
         Average grad with round-robin device selection.
 
         Args:
             tower_grads: Ngpu x Nvar x 2
+
+        Returns:
+            Nvar x 2
         """
         nr_device = len(devices)
         if nr_device == 1:
```
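For readers skimming the diff, here is a minimal, self-contained sketch of the round-robin pattern this docstring describes; the function name and simplifications are mine, not tensorpack's verbatim code. The i-th variable's gradients from all towers are averaged on `devices[i % len(devices)]`, spreading the reduction work across GPUs instead of funneling it through one device.

```python
# Minimal sketch (assumed, simplified) of round-robin gradient averaging.
import tensorflow as tf

def average_grads_round_robin(tower_grads, devices):
    """tower_grads: Ngpu x Nvar x (grad, var) -> Nvar x (avg_grad, var)."""
    nr_device = len(devices)
    if nr_device == 1:
        return tower_grads[0]
    new_grads = []
    # zip(*tower_grads) regroups the i-th (grad, var) pair of every tower
    for i, grad_and_vars in enumerate(zip(*tower_grads)):
        v = grad_and_vars[0][1]  # the variable is the same across towers
        with tf.device(devices[i % nr_device]):  # round-robin placement
            avg_grad = tf.add_n([g for g, _ in grad_and_vars]) / float(nr_device)
        new_grads.append((avg_grad, v))
    return new_grads
```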
```diff
@@ -104,6 +109,46 @@ class DistributedReplicatedTrainer(SingleCostFeedfreeTrainer):
             new_tower_grads.append((grad, v))
         return new_tower_grads
 
+    @staticmethod
+    def _apply_shadow_vars(avg_grads):
+        """
+        Replace variables in avg_grads by shadow variables.
+        """
+        ps_var_grads = []
+        for grad, var in avg_grads:
+            my_name = PS_SHADOW_VAR_PREFIX + '/' + var.name
+            my_name = get_op_tensor_name(my_name)[0]
+            new_v = tf.get_variable(my_name, dtype=var.dtype.base_dtype,
+                                    initializer=var.initial_value,
+                                    trainable=True)
+            # (g, v) to be applied, where v is global (ps vars)
+            ps_var_grads.append((grad, new_v))
+        return ps_var_grads
+
+    def _apply_gradients_and_copy(self, raw_grad_list, ps_var_grads):
+        """
+        Args:
+            raw_grad_list: Ngpu x Nvar x 2 gradient list from all towers
+            ps_var_grads: Nvar x 2 (grad, ps_var)
+
+        Returns:
+            list of copy ops
+        """
+        # TODO do this for each variable separately?
+        opt = self.model.get_optimizer()
+        # TODO ensure it in global scope, not local
+        var_update_ops = []
+        for vid, (g, v) in enumerate(ps_var_grads):
+            apply_gradient_op = opt.apply_gradients([(g, v)])
+            barrier = self.add_sync_queues_and_barrier(
+                'param_update_barrier_{}'.format(vid), [apply_gradient_op])
+            with tf.control_dependencies([barrier]), \
+                    tf.device(self.cpu_device):
+                updated_value = v.read_value()
+                for towerid in range(self.nr_gpu):
+                    var_update_ops.append(
+                        raw_grad_list[towerid][vid][1].assign(updated_value))
+        return var_update_ops
+
     def _setup(self):
         conf = get_default_sess_config()
         self.server = tf.train.Server(
```
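`get_op_tensor_name` comes from tensorpack's utilities and is not shown in this diff. The behavior `_apply_shadow_vars` relies on is small: a TF variable name such as `conv0/W:0` carries a `:0` output suffix, and the helper returns the op-style name without it, so the shadow variable is created as `ps_var/conv0/W`. A sketch of that contract (assumed to match the real helper):

```python
# Sketch (assumed) of get_op_tensor_name's contract:
# return (op_name, tensor_name), stripping/appending the ':0' suffix.
def get_op_tensor_name(name):
    if len(name) >= 3 and name[-2] == ':':
        return name[:-2], name
    return name, name + ':0'

print(get_op_tensor_name('ps_var/conv0/W:0')[0])  # ps_var/conv0/W
```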
```diff
@@ -128,35 +173,12 @@ class DistributedReplicatedTrainer(SingleCostFeedfreeTrainer):
                 var_strategy='replicated')
 
         avg_grads = DistributedReplicatedTrainer._average_grads(grad_list, self.raw_devices)
-        # Nvar * 2
-        ps_var_grads = []
-        for i, (grad, var) in enumerate(avg_grads):
-            with tf.device(self.param_server_device):
-                my_name = PS_SHADOW_VAR_PREFIX + '/' + var.name
-                my_name = get_op_tensor_name(my_name)[0]
-                new_v = tf.get_variable(my_name, dtype=var.dtype.base_dtype,
-                                        initializer=var.initial_value,
-                                        trainable=True)
-                # (g, v) to be applied, where v is global (ps vars)
-                ps_var_grads.append((grad, new_v))
-
-        # apply gradients TODO do this for each variable separately?
-        var_update_ops = []
-        with tf.device(self.param_server_device):
-            for vid, (g, v) in enumerate(ps_var_grads):
-                apply_gradient_op = opt.apply_gradients([(g, v)])
-                barrier = self.add_sync_queues_and_barrier(
-                    'param_update_barrier_{}'.format(vid), [apply_gradient_op])
-                with tf.control_dependencies([barrier]), \
-                        tf.device(self.cpu_device):
-                    updated_value = v.read_value()
-                    for towerid in range(self.nr_gpu):
-                        logger.info("Step update {} -> {}".format(
-                            v.name, grad_list[towerid][vid][1].name))
-                        var_update_ops.append(
-                            grad_list[towerid][vid][1].assign(updated_value))
-
-        self.main_fetch = tf.group(*var_update_ops, name='main_fetches')
-        #self.train_op = self.main_fetch
-        self.train_op = self.add_sync_queues_and_barrier(
-            'sync_queues_step_end', [self.main_fetch])
+        with tf.device(self.param_server_device):
+            ps_var_grads = DistributedReplicatedTrainer._apply_shadow_vars(avg_grads)
+            var_update_ops = self._apply_gradients_and_copy(grad_list, ps_var_grads)
+
+        main_fetch = tf.group(*var_update_ops, name='main_fetches')
+        self.train_op = self.add_sync_queues_and_barrier(
+            'sync_queues_step_end', [main_fetch])
 
         self.post_init_op = self.get_post_init_ops()
 
     def setup(self):
```
```diff
@@ -185,10 +207,8 @@ class DistributedReplicatedTrainer(SingleCostFeedfreeTrainer):
             summary_op=None,
             save_model_secs=0,
             summary_writer=None)
-        conf = get_default_sess_config()
         sess = self.sv.prepare_or_wait_for_session(
             master=self.server.target,
-            config=conf, start_standard_services=False)
+            start_standard_services=False)
         self.sess = sess
```
```diff
@@ -198,7 +218,6 @@ class DistributedReplicatedTrainer(SingleCostFeedfreeTrainer):
         self._monitored_sess = tf.train.MonitoredSession(
             session_creator=ReuseSessionCreator(self.sess), hooks=None)
-        #self._monitored_sess = self.sv
         hooks = self._callbacks.get_hooks()
         self.hooked_sess = HookedSession(self.sess, hooks)
```
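`ReuseSessionCreator` is tensorpack's adapter for plugging an already-created session into `tf.train.MonitoredSession`, which otherwise wants to create the session itself. A rough sketch of the idea (assumed; the real class lives in tensorpack's tfutils, not in this diff):

```python
# Rough sketch (assumed) of a ReuseSessionCreator-style adapter: a
# SessionCreator whose create_session() returns an existing session.
import tensorflow as tf

class ReuseSessionCreatorSketch(tf.train.SessionCreator):
    def __init__(self, sess):
        self.sess = sess

    def create_session(self):
        return self.sess

# usage: wrap the Supervisor-created session from the hunk above
# monitored = tf.train.MonitoredSession(
#     session_creator=ReuseSessionCreatorSketch(sess), hooks=None)
```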
```diff
@@ -213,7 +232,6 @@ class DistributedReplicatedTrainer(SingleCostFeedfreeTrainer):
         an op that should be used as control dependency before starting next step.
         """
         self.sync_queue_counter += 1
-        self.num_worker = self.cluster.num_tasks('worker')
         with tf.device(self.sync_queue_devices[self.sync_queue_counter % len(self.sync_queue_devices)]):
             sync_queues = [
                 tf.FIFOQueue(self.num_worker, [tf.bool], shapes=[[]],
```
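The `FIFOQueue`s above implement a cross-worker barrier in the style popularized by TensorFlow's distributed benchmark code: each worker owns a shared queue, enqueues a token into every other worker's queue, then blocks dequeuing `num_worker - 1` tokens from its own. A condensed sketch (assumed; `task_index` and the free-function shape are illustrative, not the trainer's actual attributes):

```python
# Condensed sketch (assumed) of the queue-based worker barrier pattern.
import tensorflow as tf

def queue_barrier(name_prefix, num_worker, task_index, dependencies):
    # one queue per worker; shared_name makes each visible cluster-wide
    sync_queues = [
        tf.FIFOQueue(num_worker, [tf.bool], shapes=[[]],
                     shared_name='{}-{}'.format(name_prefix, i))
        for i in range(num_worker)]
    queue_ops = []
    token = tf.constant(False)
    with tf.control_dependencies(dependencies):
        # signal every *other* worker that this worker reached the barrier
        for i, q in enumerate(sync_queues):
            if i != task_index:
                queue_ops.append(q.enqueue(token))
    # block until all other workers have signaled this one
    queue_ops.append(sync_queues[task_index].dequeue_many(num_worker - 1))
    return tf.group(*queue_ops)
```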
```diff
@@ -257,7 +275,6 @@ class DistributedReplicatedTrainer(SingleCostFeedfreeTrainer):
             name = 'tower%s/%s' % (i, prefix)
             if name in local_var_by_name:
                 copy_to = local_var_by_name[name]
-                logger.info("Post Init {} -> {}".format(v.name, copy_to.name))
                 post_init_ops.append(copy_to.assign(v.read_value()))
             else:
                 logger.warn("Global var {} doesn't match local var".format(v.name))
```
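The name matching in this last hunk rests on a naming convention: a global shadow variable's op name, say `ps_var/conv0/W`, maps to per-tower local copies `tower0/conv0/W`, `tower1/conv0/W`, and so on. A hypothetical helper (not in the diff) that makes the mapping explicit:

```python
# Hypothetical illustration (not actual tensorpack code) of the
# global -> tower-local variable name mapping used by get_post_init_ops.
PS_SHADOW_VAR_PREFIX = 'ps_var'

def local_names_for(global_op_name, nr_gpu):
    prefix = PS_SHADOW_VAR_PREFIX + '/'
    assert global_op_name.startswith(prefix), global_op_name
    stripped = global_op_name[len(prefix):]
    return ['tower%s/%s' % (i, stripped) for i in range(nr_gpu)]

print(local_names_for('ps_var/conv0/W', 2))
# ['tower0/conv0/W', 'tower1/conv0/W']
```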