Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
seminar-breakout
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Shashank Suhas
seminar-breakout
Commits
820bcac1
Commit
820bcac1
authored
Jun 17, 2018
by
Yuxin Wu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[MaskRCNN] support horovod
parent
456f5675
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
43 additions
and
19 deletions
+43
-19
examples/FasterRCNN/config.py
examples/FasterRCNN/config.py
+1
-0
examples/FasterRCNN/data.py
examples/FasterRCNN/data.py
+9
-2
examples/FasterRCNN/train.py
examples/FasterRCNN/train.py
+33
-17
No files found.
examples/FasterRCNN/config.py
View file @
820bcac1
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
import
numpy
as
np
import
numpy
as
np
# mode flags ---------------------
# mode flags ---------------------
TRAINER
=
'replicated'
# options: 'horovod', 'replicated'
MODE_MASK
=
True
MODE_MASK
=
True
MODE_FPN
=
False
MODE_FPN
=
False
...
...
examples/FasterRCNN/data.py
View file @
820bcac1
...
@@ -8,7 +8,8 @@ import itertools
...
@@ -8,7 +8,8 @@ import itertools
from
tensorpack.utils.argtools
import
memoized
,
log_once
from
tensorpack.utils.argtools
import
memoized
,
log_once
from
tensorpack.dataflow
import
(
from
tensorpack.dataflow
import
(
imgaug
,
TestDataSpeed
,
PrefetchDataZMQ
,
MultiProcessMapDataZMQ
,
imgaug
,
TestDataSpeed
,
PrefetchDataZMQ
,
MultiProcessMapDataZMQ
,
MultiThreadMapData
,
MapDataComponent
,
DataFromList
)
MapDataComponent
,
DataFromList
)
from
tensorpack.utils
import
logger
from
tensorpack.utils
import
logger
# import tensorpack.utils.viz as tpviz
# import tensorpack.utils.viz as tpviz
...
@@ -353,7 +354,13 @@ def get_train_dataflow():
...
@@ -353,7 +354,13 @@ def get_train_dataflow():
# tpviz.interactive_imshow(viz)
# tpviz.interactive_imshow(viz)
return
ret
return
ret
ds
=
MultiProcessMapDataZMQ
(
ds
,
10
,
preprocess
)
if
config
.
TRAINER
==
'horovod'
:
ds
=
MultiThreadMapData
(
ds
,
5
,
preprocess
)
# MPI does not like fork(), but we use it for speed anyway.
# We only fork once here, which seems to work fine.
ds
=
PrefetchDataZMQ
(
ds
,
1
)
else
:
ds
=
MultiProcessMapDataZMQ
(
ds
,
10
,
preprocess
)
return
ds
return
ds
...
...
examples/FasterRCNN/train.py
View file @
820bcac1
...
@@ -94,7 +94,7 @@ class DetectionModel(ModelDesc):
...
@@ -94,7 +94,7 @@ class DetectionModel(ModelDesc):
def
optimizer
(
self
):
def
optimizer
(
self
):
lr
=
tf
.
get_variable
(
'learning_rate'
,
initializer
=
0.003
,
trainable
=
False
)
lr
=
tf
.
get_variable
(
'learning_rate'
,
initializer
=
0.003
,
trainable
=
False
)
tf
.
summary
.
scalar
(
'learning_rate'
,
lr
)
tf
.
summary
.
scalar
(
'learning_rate
-summary
'
,
lr
)
factor
=
get_batch_factor
()
factor
=
get_batch_factor
()
if
factor
!=
1
:
if
factor
!=
1
:
...
@@ -586,7 +586,15 @@ if __name__ == '__main__':
...
@@ -586,7 +586,15 @@ if __name__ == '__main__':
COCODetection
(
config
.
BASEDIR
,
'val2014'
)
# Only to load the class names into caches
COCODetection
(
config
.
BASEDIR
,
'val2014'
)
# Only to load the class names into caches
predict
(
pred
,
args
.
predict
)
predict
(
pred
,
args
.
predict
)
else
:
else
:
logger
.
set_logger_dir
(
args
.
logdir
)
os
.
environ
[
'TF_AUTOTUNE_THRESHOLD'
]
=
'1'
is_horovod
=
config
.
TRAINER
==
'horovod'
if
is_horovod
:
import
horovod.tensorflow
as
hvd
hvd
.
init
()
logger
.
info
(
"Horovod Rank={}, Size={}"
.
format
(
hvd
.
rank
(),
hvd
.
size
()))
if
not
is_horovod
or
hvd
.
rank
()
==
0
:
logger
.
set_logger_dir
(
args
.
logdir
,
'd'
)
print_config
()
print_config
()
factor
=
get_batch_factor
()
factor
=
get_batch_factor
()
stepnum
=
config
.
STEPS_PER_EPOCH
stepnum
=
config
.
STEPS_PER_EPOCH
...
@@ -600,27 +608,35 @@ if __name__ == '__main__':
...
@@ -600,27 +608,35 @@ if __name__ == '__main__':
lr_schedule
.
append
(
lr_schedule
.
append
(
(
steps
*
factor
//
stepnum
,
config
.
BASE_LR
*
mult
))
(
steps
*
factor
//
stepnum
,
config
.
BASE_LR
*
mult
))
callbacks
=
[
PeriodicCallback
(
ModelSaver
(
max_to_keep
=
10
,
keep_checkpoint_every_n_hours
=
1
),
every_k_epochs
=
20
),
# linear warmup
ScheduledHyperParamSetter
(
'learning_rate'
,
warmup_schedule
,
interp
=
'linear'
,
step_based
=
True
),
ScheduledHyperParamSetter
(
'learning_rate'
,
lr_schedule
),
EvalCallback
(),
PeakMemoryTracker
(),
EstimatedTimeLeft
(),
]
if
not
is_horovod
:
callbacks
.
extend
([
GPUUtilizationTracker
(),
SessionRunTimeout
(
60000
),
# 1 minute timeout
])
cfg
=
TrainConfig
(
cfg
=
TrainConfig
(
model
=
get_model
(),
model
=
get_model
(),
data
=
QueueInput
(
get_train_dataflow
()),
data
=
QueueInput
(
get_train_dataflow
()),
callbacks
=
[
callbacks
=
callbacks
,
PeriodicCallback
(
ModelSaver
(
max_to_keep
=
10
,
keep_checkpoint_every_n_hours
=
1
),
every_k_epochs
=
20
),
# linear warmup
ScheduledHyperParamSetter
(
'learning_rate'
,
warmup_schedule
,
interp
=
'linear'
,
step_based
=
True
),
ScheduledHyperParamSetter
(
'learning_rate'
,
lr_schedule
),
EvalCallback
(),
GPUUtilizationTracker
(),
PeakMemoryTracker
(),
EstimatedTimeLeft
(),
SessionRunTimeout
(
60000
),
# 1 minute timeout
],
steps_per_epoch
=
stepnum
,
steps_per_epoch
=
stepnum
,
max_epoch
=
config
.
LR_SCHEDULE
[
-
1
]
*
factor
//
stepnum
,
max_epoch
=
config
.
LR_SCHEDULE
[
-
1
]
*
factor
//
stepnum
,
session_init
=
get_model_loader
(
args
.
load
)
if
args
.
load
else
None
,
session_init
=
get_model_loader
(
args
.
load
)
if
args
.
load
else
None
,
)
)
# nccl mode gives the best speed
# nccl mode gives the best speed
trainer
=
SyncMultiGPUTrainerReplicated
(
get_nr_gpu
(),
mode
=
'nccl'
)
if
is_horovod
:
trainer
=
HorovodTrainer
()
else
:
trainer
=
SyncMultiGPUTrainerReplicated
(
get_nr_gpu
(),
mode
=
'nccl'
)
launch_train_with_config
(
cfg
,
trainer
)
launch_train_with_config
(
cfg
,
trainer
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment