Commit fe5d4984 authored Feb 02, 2018 by Yuxin Wu
average option in replicated trainers
parent 70c9ba8f
Showing 4 changed files with 26 additions and 17 deletions:

    examples/ResNet/imagenet_utils.py      +1   -1
    tensorpack/graph_builder/training.py   +6   -2
    tensorpack/graph_builder/utils.py      +4   -2
    tensorpack/train/trainers.py           +15  -12
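Taken together, the diffs below thread a new "average" option from the user-facing trainers down to the gradient all-reduce. A minimal usage sketch, assuming SyncMultiGPUTrainerReplicated is importable from tensorpack.train in this version of the code (the surrounding model and dataflow setup is omitted):

    # Sketch only: assumes the trainer signature shown in
    # tensorpack/train/trainers.py below; model/dataflow setup is omitted.
    from tensorpack.train import SyncMultiGPUTrainerReplicated

    # Default behaviour: average gradients across the 4 GPUs.
    trainer = SyncMultiGPUTrainerReplicated([0, 1, 2, 3])

    # New in this commit: sum them instead of averaging.
    trainer = SyncMultiGPUTrainerReplicated([0, 1, 2, 3], average=False)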
examples/ResNet/imagenet_utils.py

@@ -179,7 +179,7 @@ class ImageNetModel(ModelDesc):
     def _get_optimizer(self):
         lr = tf.get_variable('learning_rate', initializer=0.1, trainable=False)
-        tf.summary.scalar('learning_rate', lr)
+        tf.summary.scalar('learning_rate-summary', lr)
         return tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True)

     @staticmethod
tensorpack/graph_builder/training.py

@@ -175,12 +175,16 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
     Data-parallel training in "replicated" mode,
     where each GPU contains a replicate of the whole model.
     It will build one tower on each GPU under its own variable scope.
-    Each gradient update is averaged across all GPUs through NCCL.
+    Each gradient update is averaged or summed across all GPUs through NCCL.
     It is an equivalent of ``--variable_update=replicated`` in
     `tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
     """

+    def __init__(self, towers, average):
+        super(SyncMultiGPUReplicatedBuilder, self).__init__(towers)
+        self._average = average
+
     def build(self, get_grad_fn, get_opt_fn):
         """
         Args:

@@ -207,7 +211,7 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
         DataParallelBuilder._check_grad_list(grad_list)

         if True:
-            grads = allreduce_grads(grad_list)  # #gpu x #param x 2
+            grads = allreduce_grads(grad_list, average=self._average)  # #gpu x #param x 2
         else:
             agg_grad_and_vars = average_grads(grad_list, colocation=False, devices=['/cpu:0'])  # #param x 2
             grads = []  # #gpu x #param x 2
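The builder now receives the flag at construction time and forwards it to allreduce_grads. Direct use of the builder is unusual (the trainer further below constructs it internally), but here is a small sketch of the new constructor, assuming the import path matches this file's location:

    # Illustration only; normally SyncMultiGPUTrainerReplicated creates this
    # builder for you (see tensorpack/train/trainers.py below).
    from tensorpack.graph_builder.training import SyncMultiGPUReplicatedBuilder

    builder = SyncMultiGPUReplicatedBuilder(towers=[0, 1], average=True)
    # train_op = builder.build(get_grad_fn, get_opt_fn)
    # get_grad_fn / get_opt_fn are the callables the trainer normally supplies.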
tensorpack/graph_builder/utils.py

@@ -96,13 +96,14 @@ class LeastLoadedDeviceSetter(object):
         return "LeastLoadedDeviceSetter-{}".format(self.worker_device)


-def allreduce_grads(all_grads):
+def allreduce_grads(all_grads, average):
     """
     All-reduce average the gradients among devices. Results are broadcasted to all devices.

     Args:
         all_grads (K x N x 2): A list of K lists. Each of the list is a list of N (grad, var) tuples.
             The variables have to be the same across the K lists.
+        average (bool): average gradients or not.

     Returns:
         (K x N x 2): same as input, but each grad is replaced by the average over K lists.

@@ -122,7 +123,8 @@ def allreduce_grads(all_grads):
         for (_, v), g in zip(grad_and_vars, summed):
             with tf.device(g.device):
                 # tensorflow/benchmarks didn't average gradients
-                g = tf.multiply(g, 1.0 / nr_tower)
+                if average:
+                    g = tf.multiply(g, 1.0 / nr_tower)
                 grads_for_a_var.append((g, v))
         new_all_grads.append(grads_for_a_var)
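For intuition, a toy NumPy illustration (hypothetical numbers) of what the flag changes: the NCCL all-reduce always produces the per-variable sum over towers, and the division by nr_tower is now applied only when average is True.

    import numpy as np

    # Hypothetical gradients of one variable on two towers.
    g_tower0 = np.array([0.2, -0.4])
    g_tower1 = np.array([0.6,  0.0])
    nr_tower = 2

    summed = g_tower0 + g_tower1          # what the all-reduce yields on each device
    averaged = summed * (1.0 / nr_tower)  # extra step taken only if average=True

    print(summed)    # [ 0.8 -0.4]  -> gradient used when average=False
    print(averaged)  # [ 0.4 -0.2]  -> gradient used when average=True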
tensorpack/train/trainers.py

@@ -138,13 +138,14 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
     """

     @map_arg(gpus=_int_to_range)
-    def __init__(self, gpus):
+    def __init__(self, gpus, average=True):
         """
         Args:
-            gpus ([int]): list of GPU ids.
+            gpus (int or [int]): list of GPU ids.
+            average (bool): whether to average or sum gradients.
         """
         self.devices = gpus
-        self._builder = SyncMultiGPUReplicatedBuilder(gpus)
+        self._builder = SyncMultiGPUReplicatedBuilder(gpus, average)
         super(SyncMultiGPUTrainerReplicated, self).__init__()

     def _setup_graph(self, input, get_cost_fn, get_opt_fn):

@@ -283,18 +284,21 @@ class HorovodTrainer(SingleCostTrainer):
     (Add other environment variables you need by -x, e.g. PYTHONPATH, PATH)

     Note:
-        1. Gradients are averaged among all processes.
-        2. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
-        3. Due to the use of MPI, training is less informative (no progress bar).
-        4. MPI often fails to kill all processes. Be sure to check it.
+        1. If using all GPUs, you can always skip the `CUDA_VISIBLE_DEVICES` option.
+        2. Due to the use of MPI, training is less informative (no progress bar).
+        3. MPI often fails to kill all processes. Be sure to check it.
     """

-    def __init__(self):
+    def __init__(self, average=True):
+        """
+        Args:
+            average (bool): whether to average or sum the gradients across processes.
+        """
         hvd.init()
         self.is_chief = hvd.rank() == 0
         self._local_rank = hvd.local_rank()
+        self._average = average
         logger.info("Horovod local rank={}".format(self._local_rank))
         super(HorovodTrainer, self).__init__()

@@ -306,7 +310,7 @@ class HorovodTrainer(SingleCostTrainer):
         with tf.name_scope("HVDAllReduce"):
             for grad, var in grads:
                 if grad is not None:
-                    avg_grad = hvd.allreduce(grad, average=True)
+                    avg_grad = hvd.allreduce(grad, average=self._average)
                     averaged_gradients.append((avg_grad, var))
                 else:
                     averaged_gradients.append((None, var))

@@ -323,8 +327,7 @@ class HorovodTrainer(SingleCostTrainer):
         op = hvd.broadcast_global_variables(0)
         cb = RunOp(op, run_before=True,
-                   run_as_trigger=False, verbose=True)
-        cb.chief_only = False
+                   run_as_trigger=False, verbose=True).set_chief_only(False)
         return [cb]

     @HIDE_DOC
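The same flag on the Horovod trainer ends up as the average argument of hvd.allreduce. A hedged usage sketch; the script name and mpirun command line are hypothetical, following the launch style described in the class docstring above:

    # train_hvd.py (hypothetical script name)
    from tensorpack.train import HorovodTrainer

    # Sum gradients across MPI processes instead of averaging them;
    # each gradient then goes through hvd.allreduce(grad, average=False).
    trainer = HorovodTrainer(average=False)

    # Hypothetical launch, one process per GPU:
    #   mpirun -np 4 -x PYTHONPATH python train_hvd.py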