Commit 74ca05dc
authored Oct 30, 2017 by Yuxin Wu
horovod now works on multigpu (#422)
parent 1a262e8c

Showing 1 changed file with 20 additions and 4 deletions:

    tensorpack/train/trainers.py  (+20, -4)
--- a/tensorpack/train/trainers.py
+++ b/tensorpack/train/trainers.py
@@ -3,6 +3,7 @@
 # File: trainers.py
 
 import os
+import tensorflow as tf
 
 from ..callbacks import RunOp
 from ..tfutils.sesscreate import NewSessionCreator
@@ -213,7 +214,12 @@ class HorovodTrainer(SingleCostTrainer):
     def __init__(self):
         hvd.init()
         self.is_chief = hvd.rank() == 0
-        logger.info("Horovod local rank: {}".format(hvd.local_rank()))
+        local_rank = hvd.local_rank()
+        devices = os.environ['CUDA_VISIBLE_DEVICES']
+        devices = list(map(int, devices.split(',')))
+        assert len(devices) >= local_rank
+        self._device = devices[local_rank]
+        logger.info("Horovod local rank={}, device={}".format(local_rank, self._device))
         super(HorovodTrainer, self).__init__()
 
     def _setup_graph(self, input, get_cost_fn, get_opt_fn):
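The __init__ change above maps each Horovod worker to one GPU by indexing the CUDA_VISIBLE_DEVICES list with the worker's local rank. A minimal standalone sketch of that mapping, using assumed example values rather than anything taken from this repository:

    import os

    # Assume the launcher exports the same GPU list to every process on the host.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'   # example value

    def gpu_for_local_rank(local_rank):
        # Same idea as the diff: parse the comma-separated device list and take
        # the entry at position local_rank (rank 0 -> GPU 0, rank 1 -> GPU 1, ...).
        devices = list(map(int, os.environ['CUDA_VISIBLE_DEVICES'].split(',')))
        return devices[local_rank]

    for rank in range(4):
        print("local rank {} -> GPU {}".format(rank, gpu_for_local_rank(rank)))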
@@ -222,10 +228,20 @@ class HorovodTrainer(SingleCostTrainer):
         opt = get_opt_fn()
         opt = hvd.DistributedOptimizer(opt)
         self.train_op = opt.apply_gradients(grads, name='min_op')
-        return [RunOp(
-            hvd.broadcast_global_variables(0),
+        cb = RunOp(
+            tf.identity(hvd.broadcast_global_variables(0),
+                        name='horovod_broadcast_global_variables'),
             run_before=True,
-            run_as_trigger=False, verbose=True)]
+            run_as_trigger=False, verbose=True)
+        cb.chief_only = False
+        return [cb]
+
+    def initialize(self, session_creator, session_init):
+        if not isinstance(session_creator, NewSessionCreator):
+            raise ValueError(
+                "Cannot set session_creator for horovod training! ")
+        session_creator._config.gpu_options.visible_device_list = str(self._device)
+        super(HorovodTrainer, self).initialize(session_creator, session_init)
 
 
 from ..utils.develop import create_dummy_class   # noqa
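The rest of the hunk pins each process's TensorFlow session to the GPU chosen in __init__ (via gpu_options.visible_device_list) and runs the rank-0 variable broadcast on every rank, not only the chief (cb.chief_only = False), so all workers start from identical weights. A hedged sketch of those two steps outside tensorpack, using only standard Horovod/TensorFlow 1.x calls; the launch command and GPU list are assumptions, and the tf.identity wrapper from the diff (which only gives the broadcast op a readable name) is omitted:

    # Illustrative launch, one process per GPU:  mpirun -np 4 python train.py
    import os
    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()

    # Restrict this process's session to the GPU matching its local rank,
    # mirroring what the new initialize() does through the session creator.
    devices = os.environ.get('CUDA_VISIBLE_DEVICES', '0,1,2,3').split(',')
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(devices[hvd.local_rank()])

    w = tf.Variable(0.0 if hvd.rank() != 0 else 1.0, name='w')
    bcast = hvd.broadcast_global_variables(0)   # the op the RunOp callback runs

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(bcast)   # run once before training, on every rank
        print("rank {} sees w = {}".format(hvd.rank(), sess.run(w)))   # 1.0 everywhere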