Merge branch 'fpn'

a19eb489 · Yuxin Wu · 804c5ee6 · 4fe9e5b1 · a19eb489 · a19eb489
Commit a19eb489 authored May 22, 2018 by Yuxin Wu
7 changed files
--- a/examples/FasterRCNN/README.md
+++ b/examples/FasterRCNN/README.md
 # Faster-RCNN / Mask-RCNN on COCO
-This example aims to provide a minimal (1.3k lines) implementation of
+This example provides a minimal (only 1.6k lines) but faithful implementation of the
-end-to-end Faster-RCNN & Mask-RCNN (with ResNet backbones) on COCO.
+following papers in combination:
+ [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497)
+ [Feature Pyramid Networks for Object Detection](https://arxiv.org/abs/1612.03144)
+ [Mask R-CNN](https://arxiv.org/abs/1703.06870)
 ## Dependencies
-+ Python 3; TensorFlow >= 1.4.0
+ Python 3; TensorFlow >= 1.4.0 (>=1.6.0 recommended due to a TF bug);
 + [pycocotools](https://github.com/pdollar/coco/tree/master/PythonAPI/pycocotools), OpenCV.
 + Pre-trained [ResNet model](http://models.tensorpack.com/ResNet/) from tensorpack model zoo.
 + COCO data. It assumes the following directory structure:
@@ -53,18 +57,21 @@ MaskRCNN results contain both bbox and segm mAP.
 |Backbone|`FASTRCNN_BATCH`|resolution |schedule|mAP (bbox/segm)|Time          |
 |   -    |    -           |    -      |   -    |   -           |   -          |
-|R-50    |64              |(600, 1024)|280k    |33.1           |18h on 8 V100s|
+|R50-C4  |64              |(600, 1024)|280k    |33.1           |18h on 8 V100s|
-|R-50    |512             |(800, 1333)|280k    |35.6           |55h on 8 P100s|
+|R50-C4  |512             |(800, 1333)|280k    |35.6           |55h on 8 P100s|
-|R-50    |512             |(800, 1333)|360k    |36.6           |49h on 8 V100s|
+|R50-C4  |512             |(800, 1333)|360k    |36.6           |49h on 8 V100s|
-|R-50    |256             |(800, 1333)|280k    |36.8/32.1      |39h on 8 P100s|
+|R50-FPN |512             |(800, 1333)|360k    |37.5           |28h on 8 V100s|
-|R-50    |512							|(800, 1333)|360k    |37.8/33.2      |51h on 8 V100s|
+|R50-C4  |256             |(800, 1333)|280k    |36.8/32.1      |39h on 8 P100s|
-|R-101   |512             |(800, 1333)|280k    |40.1/34.4      |70h on 8 P100s|
+|R50-C4  |512             |(800, 1333)|360k    |37.8/33.2      |51h on 8 V100s|
-|R-101   |512             |(800, 1333)|360k    |40.8/35.1      |63h on 8 V100s|
+|R50-FPN |512             |(800, 1333)|360k    |38.1/34.9      |38h on 8 V100s|
+|R101-C4 |512             |(800, 1333)|280k    |40.1/34.4      |70h on 8 P100s|
+|R101-C4 |512             |(800, 1333)|360k    |40.8/35.1      |63h on 8 V100s|
-The two R-50 360k models have the same configuration __and mAP__ 
+The two R50-C4 360k models have the same configuration __and mAP__
 as the `R50-C4-2x` entries in
 [Detectron Model Zoo](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md#end-to-end-faster--mask-r-cnn-baselines).
-So far this seems to be the only open source re-implementation that can reproduce mAP in Detectron.
+So far this is the only TensorFlow implementation that can reproduce mAP in Detectron.
+The other models listed here do not correspond to any configurations in Detectron.
 ## Notes

--- a/examples/FasterRCNN/basemodel.py
+++ b/examples/FasterRCNN/basemodel.py
@@ -92,7 +92,7 @@ def resnet_group(l, name, block_func, features, count, stride):
    return l
-def pretrained_resnet_c4_backbone(image, num_blocks, freeze_c2=True):
+def resnet_c4_backbone(image, num_blocks, freeze_c2=True):
    assert len(num_blocks) == 3
    with resnet_argscope():
        l = tf.pad(image, [[0, 0], [0, 0], [2, 3], [2, 3]])
@@ -116,10 +116,19 @@ def resnet_conv5(image, num_block):
        return l
-def pretrained_resnet_fpn_backbone(image, num_blocks, freeze_c2=True):
+def resnet_fpn_backbone(image, num_blocks, freeze_c2=True):
+    shape2d = tf.shape(image)[2:]
+    mult = config.FPN_RESOLUTION_REQUIREMENT * 1.
+    new_shape2d = tf.to_int32(tf.ceil(tf.to_float(shape2d) / mult) * mult)
+    pad_shape2d = new_shape2d - shape2d
    assert len(num_blocks) == 4
+    # TODO pad 1 at each stage
    with resnet_argscope():
-        l = tf.pad(image, [[0, 0], [0, 0], [2, 3], [2, 3]])
+        chan = image.shape[1]
+        l = tf.pad(image,
+                   tf.stack([[0, 0], [0, 0],
+                            [2, 3 + pad_shape2d[0]], [2, 3 + pad_shape2d[1]]]))
+        l.set_shape([None, chan, None, None])
        l = Conv2D('conv0', l, 64, 7, strides=2, activation=BNReLU, padding='VALID')
        l = tf.pad(l, [[0, 0], [0, 0], [0, 1], [0, 1]])
        l = MaxPooling('pool0', l, 3, strides=2, padding='VALID')

--- a/examples/FasterRCNN/config.py
+++ b/examples/FasterRCNN/config.py
@@ -5,6 +5,7 @@ import numpy as np
 # mode flags ---------------------
 MODE_MASK = True
+MODE_FPN = False
 # dataset -----------------------
 BASEDIR = '/path/to/your/COCO/DIR'
@@ -34,6 +35,7 @@ MAX_SIZE = 1333
 # anchors -------------------------
 ANCHOR_STRIDE = 16
 ANCHOR_STRIDES_FPN = (4, 8, 16, 32, 64)  # strides for each FPN level. Must be the same length as ANCHOR_SIZES
+FPN_RESOLUTION_REQUIREMENT = 32    # image size into the backbone has to be multiple of this number
 ANCHOR_SIZES = (32, 64, 128, 256, 512)   # sqrtarea of the anchor box
 ANCHOR_RATIOS = (0.5, 1., 2.)
 NUM_ANCHOR = len(ANCHOR_SIZES) * len(ANCHOR_RATIOS)
@@ -48,6 +50,7 @@ RPN_MIN_SIZE = 0
 RPN_PROPOSAL_NMS_THRESH = 0.7
 TRAIN_PRE_NMS_TOPK = 12000
 TRAIN_POST_NMS_TOPK = 2000
+TRAIN_FPN_NMS_TOPK = 2000
 CROWD_OVERLAP_THRES = 0.7  # boxes overlapping crowd will be ignored.
 # fastrcnn training ---------------------
@@ -56,15 +59,16 @@ FASTRCNN_BBOX_REG_WEIGHTS = np.array([10, 10, 5, 5], dtype='float32')
 FASTRCNN_FG_THRESH = 0.5
 FASTRCNN_FG_RATIO = 0.25  # fg ratio in a ROI batch
+# modeling -------------------------
+FPN_NUM_CHANNEL = 256
+FASTRCNN_FC_HEAD_DIM = 1024
+MASKRCNN_HEAD_DIM = 256
 # testing -----------------------
 TEST_PRE_NMS_TOPK = 6000
 TEST_POST_NMS_TOPK = 1000   # if you encounter OOM in inference, set this to a smaller number
+TEST_FPN_NMS_TOPK = 1000
 FASTRCNN_NMS_THRESH = 0.5
 RESULT_SCORE_THRESH = 0.05
 RESULT_SCORE_THRESH_VIS = 0.3   # only visualize confident results
 RESULTS_PER_IM = 100
-# TODO Not Functioning. Don't USE
-MODE_FPN = False
-FPN_NUM_CHANNEL = 256
-FPN_SIZE_REQUIREMENT = 32
--- a/examples/FasterRCNN/data.py
+++ b/examples/FasterRCNN/data.py
@@ -8,7 +8,7 @@ import itertools
 from tensorpack.utils.argtools import memoized, log_once
 from tensorpack.dataflow import (
-    imgaug, TestDataSpeed, PrefetchDataZMQ, MapData,
+    imgaug, TestDataSpeed, PrefetchDataZMQ, MultiProcessMapDataZMQ,
    MapDataComponent, DataFromList)
 # import tensorpack.utils.viz as tpviz
@@ -51,7 +51,12 @@ def get_all_anchors(
    # anchors are intbox here.
    # anchors at featuremap [0,0] are centered at fpcoor (8,8) (half of stride)
-    field_size = int(np.ceil(config.MAX_SIZE / stride))
+    max_size = config.MAX_SIZE
+    if config.MODE_FPN:
+        # TODO setting this in config is perhaps better
+        size_mult = config.FPN_RESOLUTION_REQUIREMENT * 1.
+        max_size = np.ceil(max_size / size_mult) * size_mult
+    field_size = int(np.ceil(max_size / stride))
    shifts = np.arange(0, field_size) * stride
    shift_x, shift_y = np.meshgrid(shifts, shifts)
    shift_x = shift_x.flatten()
@@ -136,17 +141,19 @@ def get_anchor_labels(anchors, gt_boxes, crowd_boxes):
        overlap_with_crowd = cand_inds[ious.max(axis=1) > config.CROWD_OVERLAP_THRES]
        anchor_labels[overlap_with_crowd] = -1
-    # Filter fg labels: ignore some fg if fg is too many
+    # Subsample fg labels: ignore some fg if fg is too many
    target_num_fg = int(config.RPN_BATCH_PER_IM * config.RPN_FG_RATIO)
    fg_inds = filter_box_label(anchor_labels, 1, target_num_fg)
+    if len(fg_inds) == 0:
+        raise MalformedData("No valid foreground for RPN!")
    # Note that fg could be fewer than the target ratio
-    # filter bg labels. num_bg is not allowed to be too many
+    # Subsample bg labels. num_bg is not allowed to be too many
    old_num_bg = np.sum(anchor_labels == 0)
-    if old_num_bg == 0 or len(fg_inds) == 0:
+    if old_num_bg == 0:
        # No valid bg in this image, skip.
        # This can happen if, e.g. the image has large crowd.
-        raise MalformedData("No valid foreground/background for RPN!")
+        raise MalformedData("No valid background for RPN!")
    target_num_bg = config.RPN_BATCH_PER_IM - len(fg_inds)
    filter_box_label(anchor_labels, 0, target_num_bg)   # ignore return values
@@ -336,8 +343,7 @@ def get_train_dataflow(add_mask=False):
            # tpviz.interactive_imshow(viz)
        return ret
-    ds = MapData(ds, preprocess)
+    ds = MultiProcessMapDataZMQ(ds, 10, preprocess)
-    ds = PrefetchDataZMQ(ds, 1)
    return ds
@@ -359,7 +365,6 @@ if __name__ == '__main__':
    import os
    from tensorpack.dataflow import PrintData
    config.BASEDIR = os.path.expanduser('~/data/coco')
-    config.TRAIN_DATASET = ['train2014']
    ds = get_train_dataflow(add_mask=config.MODE_MASK)
    ds = PrintData(ds, 100)
    TestDataSpeed(ds, 50000).start()

--- a/examples/FasterRCNN/eval.py
+++ b/examples/FasterRCNN/eval.py
@@ -141,12 +141,15 @@ def print_evaluation_scores(json_file):
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()
-    ret['mAP(bbox)'] = cocoEval.stats[0]
+    fields = ['IoU=0.5:0.95', 'IoU=0.5', 'IoU=0.75', 'small', 'medium', 'large']
+    for k in range(6):
+        ret['mAP(bbox)/' + fields[k]] = cocoEval.stats[k]
    if config.MODE_MASK:
        cocoEval = COCOeval(coco, cocoDt, 'segm')
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
-        ret['mAP(segm)'] = cocoEval.stats[0]
+        for k in range(6):
+            ret['mAP(segm)/' + fields[k]] = cocoEval.stats[k]
    return ret
--- a/examples/FasterRCNN/model.py
+++ b/examples/FasterRCNN/model.py
--- a/examples/FasterRCNN/train.py
+++ b/examples/FasterRCNN/train.py