Shashank Suhas / seminar-breakout / Commits

Commit c34a3501, authored Aug 02, 2018 by Yuxin Wu
[MaskRCNN] fix warmup schedule for distributed training
parent 3e0515c2
Showing 2 changed files with 14 additions and 12 deletions
examples/FasterRCNN/config.py   +5 -4
examples/FasterRCNN/train.py    +9 -8
examples/FasterRCNN/config.py (view file @ c34a3501)
@@ -84,13 +84,14 @@ _C.BACKBONE.TF_PAD_MODE = False
 _C.BACKBONE.STRIDE_1X1 = False  # True for MSRA models

 # schedule -----------------------
-# The schedule and learning rate here is defined for a total batch size of 8.
-# If not running with 8 GPUs, they will be adjusted automatically in code.
 _C.TRAIN.NUM_GPUS = None  # by default, will be set from code
 _C.TRAIN.WEIGHT_DECAY = 1e-4
-_C.TRAIN.BASE_LR = 1e-2
-_C.TRAIN.WARMUP = 1000  # in steps
+_C.TRAIN.BASE_LR = 1e-2  # defined for a total batch size of 8. Otherwise it will be adjusted automatically
+_C.TRAIN.WARMUP = 1000  # in terms of iterations. This is not affected by #GPUs
 _C.TRAIN.STEPS_PER_EPOCH = 500
+# Schedule means "steps" only when total batch size is 8.
+# Otherwise the actual steps to decrease learning rate are computed from the schedule.
 # LR_SCHEDULE = [120000, 160000, 180000]  # "1x" schedule in detectron
 _C.TRAIN.LR_SCHEDULE = [240000, 320000, 360000]  # "2x" schedule in detectron
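The adjustment the new comments promise happens at training time: schedule points defined for a total batch size of 8 are converted into actual steps for the current GPU count. A minimal sketch of that conversion, assuming one image per GPU so that total batch size equals NUM_GPUS (the standalone function and its name are mine, not from this repo):

def rescale_schedule(lr_schedule, num_gpus, reference_batch=8):
    # Each step consumes num_gpus images instead of 8, so covering the
    # same number of training images takes proportionally more (or fewer) steps.
    factor = reference_batch / num_gpus
    return [int(step * factor) for step in lr_schedule]

# e.g. the "2x" schedule on 4 GPUs: [240000, 320000, 360000]
# becomes [480000, 640000, 720000] actual steps.
print(rescale_schedule([240000, 320000, 360000], num_gpus=4))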
examples/FasterRCNN/train.py (view file @ c34a3501)
@@ -65,9 +65,8 @@ class DetectionModel(ModelDesc):
         lr = tf.get_variable('learning_rate', initializer=0.003, trainable=False)
         tf.summary.scalar('learning_rate-summary', lr)

-        factor = cfg.TRAIN.NUM_GPUS / 8.
-        if factor != 1:
-            lr = lr * factor
+        # The learning rate is set for 8 GPUs, and we use trainers with average=False.
+        lr = lr / 8.
         opt = tf.train.MomentumOptimizer(lr, 0.9)
         if cfg.TRAIN.NUM_GPUS < 8:
             opt = optimizer.AccumGradOptimizer(opt, 8 // cfg.TRAIN.NUM_GPUS)
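The rewrite above leans on the fact that these trainers run with average=False, i.e. they sum gradients across GPUs rather than average them. A short sketch of the arithmetic with a hypothetical GPU count (my reading of the hunk, not code from the repo):

# One update applies lr * (sum of per-image gradients). BASE_LR is defined
# for 8 images per update, so the per-image rate is lr / 8 regardless of
# how many GPUs contribute -- hence the unconditional division by 8.
num_gpus = 2  # hypothetical cfg.TRAIN.NUM_GPUS
accum_steps = 8 // num_gpus
# AccumGradOptimizer applies the accumulated gradient every `accum_steps`
# iterations, so each applied update still sums gradients from 8 images:
assert num_gpus * accum_steps == 8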
@@ -242,7 +241,7 @@ class ResNetC4Model(DetectionModel):
                 mrcnn_loss,
                 wd_cost], 'total_cost')
             add_moving_summary(total_cost, wd_cost)
-            return total_cost * (1. / cfg.TRAIN.NUM_GPUS)
+            return total_cost
         else:
             final_boxes, final_labels = self.fastrcnn_inference(
                 image_shape2d, rcnn_boxes, fastrcnn_label_logits, fastrcnn_box_logits)
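Dropping the 1/NUM_GPUS loss scaling is the counterpart of the optimizer change above; a sketch of the equivalence for N GPUs (my derivation, not from the diff):

# Per update, with gradients summed over N GPUs (g_i from GPU i):
#   before: lr' = BASE_LR * N/8 and loss scaled by 1/N
#           -> step = (BASE_LR * N/8) * (1/N) * sum(g_i) = (BASE_LR/8) * sum(g_i)
#   after:  lr' = BASE_LR / 8 and loss unscaled
#           -> step = (BASE_LR/8) * sum(g_i)
# The effective step is identical, but NUM_GPUS is no longer baked into the
# per-tower loss, which keeps distributed (multi-machine) training consistent.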
@@ -378,7 +377,7 @@ class ResNetFPNModel(DetectionModel):
                 mrcnn_loss,
                 wd_cost], 'total_cost')
             add_moving_summary(total_cost, wd_cost)
-            return total_cost * (1. / cfg.TRAIN.NUM_GPUS)
+            return total_cost
         else:
             final_boxes, final_labels = self.fastrcnn_inference(
                 image_shape2d, rcnn_boxes, fastrcnn_label_logits, fastrcnn_box_logits)
@@ -553,13 +552,15 @@ if __name__ == '__main__':
         logger.set_logger_dir(args.logdir, 'd')

         finalize_configs(is_training=True)
-        factor = 8. / cfg.TRAIN.NUM_GPUS
         stepnum = cfg.TRAIN.STEPS_PER_EPOCH

         # warmup is step based, lr is epoch based
-        warmup_schedule = [(0, cfg.TRAIN.BASE_LR / 3), (cfg.TRAIN.WARMUP * factor, cfg.TRAIN.BASE_LR)]
-        warmup_end_epoch = cfg.TRAIN.WARMUP * factor * 1. / stepnum
+        init_lr = cfg.TRAIN.BASE_LR * 0.33 * (8. / cfg.TRAIN.NUM_GPUS)
+        warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
+        warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum
         lr_schedule = [(int(np.ceil(warmup_end_epoch)), warmup_schedule[-1][1])]
+
+        factor = 8. / cfg.TRAIN.NUM_GPUS
         for idx, steps in enumerate(cfg.TRAIN.LR_SCHEDULE[:-1]):
             mult = 0.1 ** (idx + 1)
             lr_schedule.append(
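The warmup fix is easiest to see with numbers. A worked comparison using hypothetical values NUM_GPUS = 16, BASE_LR = 1e-2, WARMUP = 1000 (values mine, not from the diff):

factor = 8. / 16  # 0.5 for a 16-GPU distributed job
# before: the warmup LENGTH scaled with factor, so >8-GPU jobs cut warmup short
old_warmup = [(0, 1e-2 / 3), (1000 * factor, 1e-2)]      # ends after only 500 steps
# after: warmup always lasts cfg.TRAIN.WARMUP iterations; the STARTING
# learning rate is scaled instead
new_warmup = [(0, 1e-2 * 0.33 * factor), (1000, 1e-2)]   # ends at step 1000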