Commit e1a879d7 (seminar-breakout), authored Feb 08, 2018 by Yuxin Wu

Make "builder.grads" visible to trainers.

parent 9bbdf94d
Showing 3 changed files with 27 additions and 17 deletions:

.travis.yml                            +8  -6
tensorpack/graph_builder/training.py  +17  -9
tensorpack/train/trainers.py           +2  -2
.travis.yml  (view file @ e1a879d7)

@@ -8,10 +8,6 @@ cache:
   directories:
     - $HOME/tensorpack_data
 
-env:
-  global:
-    - TF_VERSION=1.5.0
-
 addons:
   apt:
     packages:
@@ -24,10 +20,16 @@ matrix:
   include:
     - os: linux
       python: 2.7
-      env: TF_TYPE=release
+      env: TF_VERSION=1.3.0 TF_TYPE=release
+    - os: linux
+      python: 3.5
+      env: TF_VERSION=1.3.0 TF_TYPE=release
+    - os: linux
+      python: 2.7
+      env: TF_VERSION=1.5.0 TF_TYPE=release
     - os: linux
       python: 3.5
-      env: TF_TYPE=release
+      env: TF_VERSION=1.5.0 TF_TYPE=release
     - os: linux
       python: 2.7
       env: TF_VERSION=1.head TF_TYPE=nightly
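The net effect of the .travis.yml change: TF_VERSION is no longer pinned globally, and each build-matrix entry now selects its own TensorFlow version, so releases 1.3.0 and 1.5.0 are both exercised alongside the nightly head. As a rough sketch of how an install step might consume these variables (a hypothetical helper, not tensorpack's actual CI script):

    # Hypothetical sketch of a CI install helper driven by the matrix env vars.
    # TF_TYPE / TF_VERSION come from .travis.yml; this is not tensorpack's real script.
    import os
    import subprocess

    def install_tensorflow():
        tf_type = os.environ.get("TF_TYPE", "release")
        tf_version = os.environ.get("TF_VERSION", "1.5.0")
        if tf_type == "nightly":
            # TF_VERSION=1.head marks the entry that tracks the nightly build.
            subprocess.check_call(["pip", "install", "tf-nightly"])
        else:
            subprocess.check_call(["pip", "install", "tensorflow==" + tf_version])

    install_tensorflow()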
tensorpack/graph_builder/training.py  (view file @ e1a879d7)

@@ -123,6 +123,9 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
     It is an equivalent of ``--variable_update=parameter_server`` in
     `tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
+
+    Attribute:
+        grads: list of (g, v). Averaged gradients, available after build()
     """
 
     def __init__(self, towers, ps_device):
         """
@@ -158,15 +161,15 @@ class SyncMultiGPUParameterServerBuilder(DataParallelBuilder):
         #     self.train_op = tf.group(*ops)
         #     return
 
-        grads = average_grads(grad_list, colocation=True)
+        self.grads = average_grads(grad_list, colocation=True)
         # grads = grad_list[0]
 
         opt = get_opt_fn()
         if self.ps_device == 'cpu':
             with tf.device('/cpu:0'):
-                train_op = opt.apply_gradients(grads, name='train_op')
+                train_op = opt.apply_gradients(self.grads, name='train_op')
         else:
-            train_op = opt.apply_gradients(grads, name='train_op')
+            train_op = opt.apply_gradients(self.grads, name='train_op')
         return train_op
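This hunk is what the commit message refers to: the averaged gradients are kept on the builder as self.grads instead of a local variable, so whoever owns the builder can inspect them once build() has run. A minimal hypothetical sketch of that access pattern (builder, get_grad_fn and get_opt_fn stand in for objects the trainer already holds; only the .grads attribute comes from this commit):

    # Hypothetical sketch: reading the gradients a builder now exposes.
    # `builder`, `get_grad_fn`, `get_opt_fn` are whatever the trainer already has.
    def setup(builder, get_grad_fn, get_opt_fn):
        train_op = builder.build(get_grad_fn, get_opt_fn)
        # After build(), builder.grads is a list of (gradient, variable) pairs.
        for g, v in builder.grads:
            print(v.name)  # e.g. attach gradient summaries or clipping here
        return train_op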
@@ -179,11 +182,16 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
     It is an equivalent of ``--variable_update=replicated`` in
     `tensorflow/benchmarks <https://github.com/tensorflow/benchmarks>`_.
+
+    Attribute:
+        grads: #GPU number of lists of (g, v). Synchronized gradients on each device, available after build().
+            Though on different devices, they should contain the same values.
     """
 
-    def __init__(self, towers, average):
+    def __init__(self, towers, average, use_nccl):
         super(SyncMultiGPUReplicatedBuilder, self).__init__(towers)
         self._average = average
+        self._use_nccl = use_nccl
 
     def build(self, get_grad_fn, get_opt_fn):
         """
@@ -210,20 +218,20 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
         DataParallelBuilder._check_grad_list(grad_list)
 
-        if True:
-            grads = allreduce_grads(grad_list, average=self._average)  # #gpu x #param x 2
+        if self._use_nccl:
+            self.grads = allreduce_grads(grad_list, average=self._average)  # #gpu x #param x 2
         else:
             agg_grad_and_vars = average_grads(grad_list, colocation=False, devices=['/cpu:0'])  # #param x 2
-            grads = []  # #gpu x #param x 2
+            self.grads = []  # #gpu x #param x 2
             for grad_and_vars in grad_list:  # grad_and_vars: #paramx2
                 # take v from each tower, and g from average.
-                grads.append(
+                self.grads.append(
                     [(g, v) for (_, v), (g, _) in zip(grad_and_vars, agg_grad_and_vars)])
 
         train_ops = []
         opt = get_opt_fn()
         with tf.name_scope('apply_gradients'):
-            for idx, grad_and_vars in enumerate(grads):
+            for idx, grad_and_vars in enumerate(self.grads):
                 with tf.device(raw_devices[idx]):
                     # apply_gradients may create variables. Make them LOCAL_VARIABLES
                     with override_to_local_variable(enable=idx > 0):
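Two things happen in this hunk: the hardcoded `if True:` becomes `if self._use_nccl:`, and the per-GPU gradient lists are stored as self.grads. In the non-NCCL branch, the list comprehension re-pairs each tower's own variables with the CPU-averaged gradients. A tiny standalone illustration of that re-pairing idiom (plain strings stand in for tensors; not tensorpack code):

    # Standalone illustration of the (g, v) re-pairing from the diff above:
    # each tower keeps its own variable copy (v) but takes the averaged gradient (g).
    tower_grad_and_vars = [("g_tower0_w", "w_tower0"), ("g_tower0_b", "b_tower0")]
    agg_grad_and_vars = [("g_avg_w", "w_shared"), ("g_avg_b", "b_shared")]

    paired = [(g, v) for (_, v), (g, _) in zip(tower_grad_and_vars, agg_grad_and_vars)]
    print(paired)  # [('g_avg_w', 'w_tower0'), ('g_avg_b', 'b_tower0')]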
tensorpack/train/trainers.py  (view file @ e1a879d7)

@@ -138,14 +138,14 @@ class SyncMultiGPUTrainerReplicated(SingleCostTrainer):
     """
 
     @map_arg(gpus=_int_to_range)
-    def __init__(self, gpus, average=True):
+    def __init__(self, gpus, average=True, use_nccl=True):
         """
         Args:
             gpus (int or [int]): list of GPU ids.
             average (bool): whether to average or sum gradients.
         """
         self.devices = gpus
-        self._builder = SyncMultiGPUReplicatedBuilder(gpus, average)
+        self._builder = SyncMultiGPUReplicatedBuilder(gpus, average, use_nccl)
         super(SyncMultiGPUTrainerReplicated, self).__init__()
 
     def _setup_graph(self, input, get_cost_fn, get_opt_fn):
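The trainer simply forwards the new flag to the builder, so callers can now opt out of NCCL all-reduce (previously unconditional behind `if True:`) and fall back to CPU-side gradient averaging. A hedged usage sketch (construction only; the input pipeline and training launch are omitted, and the default use_nccl=True preserves the old behavior):

    # Usage sketch for the new flag; assumes tensorpack is installed at this commit.
    from tensorpack.train.trainers import SyncMultiGPUTrainerReplicated

    # Default: NCCL all-reduce across GPUs 0 and 1 (same behavior as before).
    trainer = SyncMultiGPUTrainerReplicated([0, 1])

    # Opt out of NCCL: gradients are averaged on /cpu:0 and re-paired with
    # each tower's variables, as in the training.py hunk above.
    trainer_no_nccl = SyncMultiGPUTrainerReplicated([0, 1], average=True, use_nccl=False)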