Shashank Suhas / seminar-breakout · Commits · 18064a54

Commit 18064a54, authored May 06, 2017 by Yuxin Wu

    Match sync training speed with tf/benchmarks (fix #254)

parent f26b9b59

Showing 4 changed files, with 88 additions and 36 deletions:

    tensorpack/models/batch_norm.py    +6   -2
    tensorpack/tfutils/model_utils.py  +2   -1
    tensorpack/tfutils/tower.py        +1   -3
    tensorpack/train/multigpu.py       +79  -30
tensorpack/models/batch_norm.py

@@ -126,8 +126,12 @@ def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
     add_model_variable(moving_var)

     # seems faster than delayed update, but might behave otherwise in distributed settings.
-    with tf.control_dependencies([update_op1, update_op2]):
-        return tf.identity(xn, name='output')
+    # TODO add an option, and maybe enable it for replica mode?
+    # with tf.control_dependencies([update_op1, update_op2]):
+    #     return tf.identity(xn, name='output')
+    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
+    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)
+    return xn


 def reshape_for_bn(param, ndims, chan, data_format):
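With this change the EMA updates are no longer forced through a control dependency on the layer output; they only run if the training op picks them up from the tf.GraphKeys.UPDATE_OPS collection. A minimal sketch of how a training step outside tensorpack would do that (the loss and optimizer below are hypothetical placeholders, not part of this commit):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 8])
w = tf.get_variable('w', [8, 1])
cost = tf.reduce_mean(tf.square(tf.matmul(x, w)))   # hypothetical scalar loss
opt = tf.train.GradientDescentOptimizer(0.1)

# update_op1/update_op2 added by BatchNorm end up in this collection;
# running them alongside the train step restores the old behaviour.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = opt.minimize(cost)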
tensorpack/tfutils/model_utils.py

@@ -28,7 +28,8 @@ def describe_model():
     table = tabulate(data, headers=['name', 'shape', 'dim'])
     size_mb = total * 4 / 1024.0**2
     summary_msg = colored(
-        "\nTotal #param={} ({:.02f} MB assuming all float32)".format(total, size_mb), 'cyan')
+        "\nTotal #vars={}, #param={} ({:.02f} MB assuming all float32)".format(
+            len(data), total, size_mb), 'cyan')
     logger.info(colored("Model Parameters: \n", 'cyan') + table + summary_msg)
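The summary line now also reports the number of variables (#vars) alongside the total parameter count. A small sketch of how the message is assembled, with made-up numbers purely for illustration:

from tabulate import tabulate
from termcolor import colored

# Hypothetical (name, shape, dim) rows; the real data comes from the trainable variables.
data = [('conv0/W', '[3, 3, 3, 64]', 1728), ('fc0/W', '[1024, 10]', 10240)]
total = sum(row[2] for row in data)
size_mb = total * 4 / 1024.0**2
table = tabulate(data, headers=['name', 'shape', 'dim'])
summary_msg = colored(
    "\nTotal #vars={}, #param={} ({:.02f} MB assuming all float32)".format(
        len(data), total, size_mb), 'cyan')
print(colored("Model Parameters: \n", 'cyan') + table + summary_msg)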
tensorpack/tfutils/tower.py

@@ -19,14 +19,12 @@ class TowerContext(object):
         """
         Args:
             tower_name (str): 'tower0', 'towerp0', or ''
-            device (str): the device to use. Defaults to either cpu0 or gpu0.
+            device (str or device function): the device to use. Defaults to either cpu0 or gpu0.
             is_training (bool): if None, automatically determine from tower_name.
         """
         self._name = tower_name
         if device is None:
             device = '/gpu:0' if tf.test.is_gpu_available() else '/cpu:0'
-        assert self.index == int(device[-1]), \
-            "Tower name {} and device {} mismatch!".format(self._name, device)
         self._device = device
         if is_training is None:
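The docstring update reflects that device may now be a TensorFlow device function rather than just a string, which is what the parameter-server trainer below passes in; the removed assert parsed the last character of a device string and would not work for a callable. A hedged sketch of the new usage (the particular device function chosen here is only an illustration):

import tensorflow as tf
from tensorpack.tfutils.tower import TowerContext

# A device function maps each op to a device; tf.device() accepts it, and after
# this commit TowerContext does too.
setter = tf.train.replica_device_setter(
    worker_device='/gpu:0', ps_device='/cpu:0', ps_tasks=1)

with TowerContext('tower0', device=setter, is_training=True):
    w = tf.get_variable('w', [10])   # variables land on the ps device (/cpu:0)
    y = tf.nn.relu(w)                # other ops stay on the worker device (/gpu:0)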
tensorpack/train/multigpu.py

@@ -5,6 +5,7 @@
 import tensorflow as tf
 import itertools
+import operator
 import re
 from six.moves import zip, range
@@ -25,25 +26,25 @@ __all__ = ['SyncMultiGPUTrainer', 'AsyncMultiGPUTrainer']
 class MultiGPUTrainer(Trainer):
     """ Base class for multi-gpu training"""

     @staticmethod
-    def multi_tower_grads(towers, get_tower_grad_func):
-        """ ret[i] is a lists of (grad,var) tuple for tower i"""
-        return MultiGPUTrainer._build_on_multi_tower(towers, get_tower_grad_func)
-
-    @staticmethod
-    def multi_tower_costs(towers, get_tower_cost_func):
-        return MultiGPUTrainer._build_on_multi_tower(towers, get_tower_cost_func)
-
-    @staticmethod
-    def _build_on_multi_tower(towers, func):
+    def build_on_multi_tower(towers, func, devices=None):
+        """
+        Args:
+            towers: list of gpu relative ids
+            func: a lambda to be called inside each tower
+            devices: a list of devices to be used. By default will use GPUs in towers.
+        """
         logger.info("Training a model of {} tower".format(len(towers)))

         ret = []
         global_scope = tf.get_variable_scope()
+        if devices is not None:
+            assert len(devices) == len(towers)
         for idx, t in enumerate(towers):
+            device = devices[idx] if devices is not None else '/gpu:{}'.format(t)
             with tf.variable_scope(global_scope, reuse=idx > 0), \
                     TowerContext(
                         'tower{}'.format(idx),
-                        device='/gpu:{}'.format(t), is_training=True):
+                        device=device, is_training=True):
                 logger.info("Building graph for training tower {}...".format(idx))
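build_on_multi_tower now accepts an optional per-tower list of devices, so the parameter-server trainer below can hand it device functions instead of plain '/gpu:k' strings. A minimal sketch of the same pattern outside tensorpack, using tf.device directly in place of TowerContext (names here are illustrative, not part of the commit):

import tensorflow as tf

def build_on_towers(towers, func, devices=None):
    """Call func once per tower, sharing variables across towers via scope reuse."""
    ret = []
    scope = tf.get_variable_scope()
    for idx, t in enumerate(towers):
        device = devices[idx] if devices is not None else '/gpu:{}'.format(t)
        with tf.variable_scope(scope, reuse=idx > 0), tf.device(device):
            ret.append(func())   # e.g. this tower's list of (grad, var) pairs
    return ret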
@@ -56,17 +57,46 @@ class MultiGPUTrainer(Trainer):
         return ret


-class SyncMultiGPUTrainer(MultiGPUTrainer,
-                          SingleCostFeedfreeTrainer):
+# Copied from https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/variable_mgr.py
+class ParamServerDeviceSetter(object):
+    """Helper class to assign variables on the least loaded ps-device."""
+    def __init__(self, worker_device, ps_devices):
+        """
+        Args:
+            worker_device: the device to use for computer ops.
+            ps_devices: a list of device to use for Variable ops. Each variable is
+                assigned to the least loaded device.
+        """
+        self.ps_devices = ps_devices
+        self.worker_device = worker_device
+        self.ps_sizes = [0] * len(self.ps_devices)
+
+    def __call__(self, op):
+        if op.device:
+            return op.device
+        if op.type not in ['Variable', 'VariableV2']:
+            return self.worker_device
+
+        device_index, _ = min(enumerate(self.ps_sizes), key=operator.itemgetter(1))
+        device_name = self.ps_devices[device_index]
+        var_size = op.outputs[0].get_shape().num_elements()
+        self.ps_sizes[device_index] += var_size
+
+        return device_name
+
+
+class SyncMultiGPUTrainerParameterServer(MultiGPUTrainer, SingleCostFeedfreeTrainer):
     """
     A multi-tower multi-GPU trainer which synchronoizes the gradients computed
-    from each tower and averages them.
+    from each tower, averages them and update to variables stored on PS.
     """

-    def __init__(self, config):
+    def __init__(self, config, ps_device='gpu'):
         """
         Args:
             config: same as in :class:`QueueInputTrainer`.
+            ps_device: either 'gpu' or 'cpu'
         """
         if config.dataflow is not None:
             # use queueinput by default. May need to avoid this in the future (when more input type is available)
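ParamServerDeviceSetter is a device function: tf.device() calls it once per op, variables go to whichever ps-device currently holds the fewest elements, and every other op stays on the worker. A hedged usage sketch (device names and shapes are illustrative):

import tensorflow as tf

raw_devices = ['/gpu:0', '/gpu:1']
setter = ParamServerDeviceSetter(worker_device='/gpu:0', ps_devices=raw_devices)

with tf.device(setter):
    w1 = tf.get_variable('w1', [4096, 4096])   # placed on the least-loaded ps device
    w2 = tf.get_variable('w2', [4096, 10])     # likely the other GPU, to balance sizes
    y = tf.matmul(tf.ones([1, 4096]), w1)      # a compute op: stays on the worker device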
@@ -74,58 +104,77 @@ class SyncMultiGPUTrainer(MultiGPUTrainer,
         else:
             self._input_method = config.data

+        assert len(config.tower) >= 1, "MultiGPUTrainer must be used with at least one tower."
         if len(config.tower) > 1:
             assert tf.test.is_gpu_available()

-            # doens't seem to improve on single GPU
+            # seem to only improve on >1 GPUs
             if not isinstance(self._input_method, StagingInputWrapper):
                 devices = ['/gpu:{}'.format(k) for k in config.tower]
                 self._input_method = StagingInputWrapper(self._input_method, devices)
-        super(SyncMultiGPUTrainer, self).__init__(config)
+
+        assert ps_device in ['gpu', 'cpu'], ps_device
+        self._ps_device = ps_device
+        super(SyncMultiGPUTrainerParameterServer, self).__init__(config)

     @staticmethod
     def _average_grads(tower_grads):
-        if len(tower_grads) == 1:
+        nr_tower = len(tower_grads)
+        if nr_tower == 1:
             return tower_grads[0]
-        ret = []
+        new_tower_grads = []
         with tf.name_scope('AvgGrad'):
             for grad_and_vars in zip(*tower_grads):
+                # Ngpu * 2
                 v = grad_and_vars[0][1]
-                all_grad = [k[0] for k in grad_and_vars]
+                all_grads = [g for (g, _) in grad_and_vars]

-                nones = list(set(all_grad))
+                nones = list(set(all_grads))
                 if None in nones and len(nones) != 1:
                     raise RuntimeError("Gradient w.r.t {} is None in some but not all towers!".format(v.name))
                 elif nones[0] is None:
                     logger.warn("No Gradient w.r.t {}".format(v.op.name))
                     continue
                 try:
-                    grad = tf.add_n(all_grad) / float(len(tower_grads))
+                    with tf.device(v.device):
+                        # colocate summed grad with var
+                        grad = tf.multiply(tf.add_n(all_grads), 1.0 / nr_tower)
                 except:
                     logger.error("Error while processing gradients of {}".format(v.name))
                     raise
-                ret.append((grad, v))
-        return ret
+                new_tower_grads.append((grad, v))
+        return new_tower_grads

     def _setup(self):
-        super(SyncMultiGPUTrainer, self)._setup()
-        grad_list = MultiGPUTrainer.multi_tower_grads(
-            self.config.tower, lambda: self._get_cost_and_grad()[1])
+        super(SyncMultiGPUTrainerParameterServer, self)._setup()
+        raw_devices = ['/gpu:{}'.format(k) for k in self.config.tower]
+        if self._ps_device == 'gpu':
+            devices = [ParamServerDeviceSetter(d, raw_devices) for d in raw_devices]
+        else:
+            devices = [tf.train.replica_device_setter(
+                worker_device=d, ps_device='/cpu:0', ps_tasks=1) for d in raw_devices]
+        grad_list = MultiGPUTrainer.build_on_multi_tower(
+            self.config.tower, lambda: self._get_cost_and_grad()[1], devices)

         # debug tower performance (without update):
         # ops = [k[0] for k in grad_list[1]] + [k[0] for k in grad_list[0]]
         # self.train_op = tf.group(*ops)
         # return

-        grads = SyncMultiGPUTrainer._average_grads(grad_list)
+        grads = SyncMultiGPUTrainerParameterServer._average_grads(grad_list)
         # grads = grad_list[0]

         self.train_op = self.config.optimizer.apply_gradients(grads, name='min_op')


+def SyncMultiGPUTrainer(config):
+    """
+    Alias for ``SyncMultiGPUTrainerParameterServer(config, ps_device='gpu')``,
+    as this is the most commonly used synchronous multigpu trainer.
+    """
+    return SyncMultiGPUTrainerParameterServer(config, ps_device='gpu')
+
+
 class AsyncMultiGPUTrainer(MultiGPUTrainer,
                            SingleCostFeedfreeTrainer):
     """
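In _average_grads, dividing the summed gradient by float(len(tower_grads)) is replaced by tf.multiply(..., 1.0 / nr_tower) placed explicitly on the variable's device, so the reduction is colocated with the variable it will update. The two forms are numerically identical, as this self-contained check illustrates:

import tensorflow as tf

grads = [tf.constant([1.0, 2.0]), tf.constant([3.0, 4.0])]   # per-tower gradients of one variable
old_style = tf.add_n(grads) / float(len(grads))
new_style = tf.multiply(tf.add_n(grads), 1.0 / len(grads))
with tf.Session() as sess:
    print(sess.run([old_style, new_style]))   # both print [2.0, 3.0]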
@@ -155,7 +204,7 @@ class AsyncMultiGPUTrainer(MultiGPUTrainer,
     def _setup(self):
         super(AsyncMultiGPUTrainer, self)._setup()
-        grad_list = MultiGPUTrainer.multi_tower_grads(
+        grad_list = MultiGPUTrainer.build_on_multi_tower(
             self.config.tower, lambda: self._get_cost_and_grad()[1])
         grad_list = [FilterNoneGrad().process(gv) for gv in grad_list]
         if self._scale_gradient and self.config.nr_tower > 1:
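After this commit, SyncMultiGPUTrainer(config) is a thin alias around the new parameter-server trainer, so existing scripts keep working while the location of the averaged variables becomes configurable. A hedged usage sketch, assuming config is an ordinary tensorpack TrainConfig (its construction is elided here):

from tensorpack.train.multigpu import (
    SyncMultiGPUTrainer, SyncMultiGPUTrainerParameterServer)

# config = TrainConfig(...)   # model, input, optimizer, tower list, etc. (assumed given)

trainer = SyncMultiGPUTrainer(config)                                    # alias, ps_device='gpu'
# trainer = SyncMultiGPUTrainerParameterServer(config, ps_device='cpu')  # keep variables on CPU
trainer.train()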