Commit 141ab53c authored May 21, 2019 by Yuxin Wu
[MaskRCNN] use un-quantized anchors; use better postprocessing; use 1x schedule
parent ae7b0774
Showing 7 changed files with 168 additions and 231 deletions:
examples/FasterRCNN/NOTES.md                      +12   -7
examples/FasterRCNN/README.md                     +28   -32
examples/FasterRCNN/config.py                      +2   -2
examples/FasterRCNN/data.py                       +74   -68
examples/FasterRCNN/eval.py                       +52   -16
examples/FasterRCNN/utils/README.md                +0   -1
examples/FasterRCNN/utils/generate_anchors.py      +0   -105
examples/FasterRCNN/NOTES.md
...
...
@@ -54,11 +54,8 @@ Model:
 4. Because of (3), BatchNorm statistics are supposed to be freezed during fine-tuning.
 5. An alternative to freezing BatchNorm is to sync BatchNorm statistics across
-   GPUs (the `BACKBONE.NORM=SyncBN` option). This would require [my bugfix](https://github.com/tensorflow/tensorflow/pull/20360)
-   which is available since TF 1.10. You can manually apply the patch to use it.
-   For now the total batch size is at most 8, so this option does not improve the model by much.
-6. Another alternative to BatchNorm is GroupNorm (`BACKBONE.NORM=GN`) which has better performance.
+   GPUs (the `BACKBONE.NORM=SyncBN` option).
+   Another alternative to BatchNorm is GroupNorm (`BACKBONE.NORM=GN`) which has better performance.
Efficiency:
...
...
@@ -74,14 +71,22 @@ Efficiency:
 1. This implementation does not use specialized CUDA ops (e.g. AffineChannel, ROIAlign).
    Therefore it might be slower than other highly-optimized implementations.
-1. To reduce RAM usage on host: (1) make sure you're using the "spawn" method as
-   set in `train.py`; (2) reduce `buffer_size` or `NUM_WORKERS` in `data.py`
-   (which may negatively impact your throughput). The training needs <10G RAM if `NUM_WORKERS=0`.
-1. Inference is unoptimized. Tensorpack is a training interface, therefore it
-   does not help you on optimized inference.
+1. To reduce RAM usage on host: (1) make sure you're using the "spawn" method as
+   set in `train.py`; (2) reduce `buffer_size` or `NUM_WORKERS` in `data.py`
+   (which may negatively impact your throughput). The training needs <10G RAM if `NUM_WORKERS=0`.
+1. Inference is unoptimized. Tensorpack is a training interface, therefore it
+   does not help you on optimized inference. In fact, the current implementation
+   uses some slow numpy operations in inference (in `eval.py:_paste_mask`).
 Possible Future Enhancements:
 1. Define a better interface to load different datasets.
...
...
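Note on the `BACKBONE.NORM` options discussed in the NOTES.md hunk above: GroupNorm computes its statistics per image and per channel group, so unlike BatchNorm it does not depend on the (small) detection batch size, which is why NOTES.md offers it as an alternative to freezing BatchNorm. A minimal NumPy sketch of what GroupNorm computes, for context only (not code from this commit; the group count and epsilon are illustrative):

    import numpy as np

    def group_norm_reference(x, num_groups=32, eps=1e-5):
        # x: feature map of shape (N, C, H, W); C must be divisible by num_groups.
        n, c, h, w = x.shape
        x = x.reshape(n, num_groups, c // num_groups, h, w)
        # Statistics are computed per sample and per group, over (channels-in-group, H, W),
        # so they are independent of the batch size -- the property NOTES.md cares about.
        mean = x.mean(axis=(2, 3, 4), keepdims=True)
        var = x.var(axis=(2, 3, 4), keepdims=True)
        x = (x - mean) / np.sqrt(var + eps)
        return x.reshape(n, c, h, w)  # learned scale/shift (gamma, beta) omitted for brevity

    out = group_norm_reference(np.random.randn(2, 64, 14, 14).astype('float32'))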
examples/FasterRCNN/README.md (diff collapsed)
examples/FasterRCNN/config.py
...
...
@@ -130,8 +130,8 @@ _C.TRAIN.STARTING_EPOCH = 1  # the first epoch to start with, useful to continue
 # the base learning rate are computed from BASE_LR and LR_SCHEDULE.
 # Therefore, there is *no need* to modify the config if you only change the number of GPUs.
 #
-_C.TRAIN.LR_SCHEDULE = [240000, 320000, 360000]    # "2x" schedule in detectron
+_C.TRAIN.LR_SCHEDULE = [120000, 160000, 180000]    # "1x" schedule in detectron
+# _C.TRAIN.LR_SCHEDULE = [240000, 320000, 360000]  # "2x" schedule in detectron
 # Longer schedules for from-scratch training (https://arxiv.org/abs/1811.08883):
 # _C.TRAIN.LR_SCHEDULE = [960000, 1040000, 1080000]    # "6x" schedule in detectron
 # _C.TRAIN.LR_SCHEDULE = [1500000, 1580000, 1620000]   # "9x" schedule in detectron
...
...
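As the config comment above says, the actual steps and base learning rate are derived from `BASE_LR` and `LR_SCHEDULE`, so the config does not need editing when the number of GPUs changes. Assuming (as the surrounding comments in this example suggest) that the schedule numbers are written relative to a total batch size of 8 images, the rescaling amounts to something like the following sketch; the helper name and `imgs_per_gpu` parameter are illustrative, not the example's actual code:

    # Illustrative only: rescale a step schedule written for a total batch size of 8.
    def rescale_schedule(schedule_for_bs8, num_gpus, imgs_per_gpu=1):
        total_batch = num_gpus * imgs_per_gpu
        return [int(step * 8.0 / total_batch) for step in schedule_for_bs8]

    # "1x" schedule on 16 GPUs: half as many steps, each step seeing twice as many images.
    print(rescale_schedule([120000, 160000, 180000], num_gpus=16))  # [60000, 80000, 90000]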
examples/FasterRCNN/data.py (diff collapsed)
examples/FasterRCNN/eval.py
...
...
@@ -13,6 +13,7 @@ from contextlib import ExitStack
 import cv2
 import pycocotools.mask as cocomask
 import tqdm
+from scipy import interpolate
 from tensorpack.callbacks import Callback
 from tensorpack.tfutils.common import get_tf_version_tuple
...
...
@@ -41,6 +42,23 @@ mask: None, or a binary image of the original image shape
"""


def _scale_box(box, scale):
    w_half = (box[2] - box[0]) * 0.5
    h_half = (box[3] - box[1]) * 0.5
    x_c = (box[2] + box[0]) * 0.5
    y_c = (box[3] + box[1]) * 0.5

    w_half *= scale
    h_half *= scale

    scaled_box = np.zeros_like(box)
    scaled_box[0] = x_c - w_half
    scaled_box[2] = x_c + w_half
    scaled_box[1] = y_c - h_half
    scaled_box[3] = y_c + h_half
    return scaled_box


def _paste_mask(box, mask, shape):
    """
    Args:
...
...
@@ -50,23 +68,42 @@ def _paste_mask(box, mask, shape):
    Returns:
        A uint8 binary image of hxw.
    """
    assert mask.shape[0] == mask.shape[1], mask.shape
    if True:
        # This method is accurate but much slower.
        mask = np.pad(mask, [(1, 1), (1, 1)], mode='constant')
        box = _scale_box(box, float(mask.shape[0]) / (mask.shape[0] - 2))
        mask_pixels = np.arange(0.0, mask.shape[0]) + 0.5
        mask_continuous = interpolate.interp2d(mask_pixels, mask_pixels, mask, fill_value=0.0)
        h, w = shape
        ys = np.arange(0.0, h) + 0.5
        xs = np.arange(0.0, w) + 0.5
        ys = (ys - box[1]) / (box[3] - box[1]) * mask.shape[0]
        xs = (xs - box[0]) / (box[2] - box[0]) * mask.shape[1]
        res = mask_continuous(xs, ys)
        return (res >= 0.5).astype('uint8')
    else:
        # This method (inspired by Detectron) is less accurate but fast.
        # int() is floor
        # box fpcoor=0.0 -> intcoor=0.0
        x0, y0 = list(map(int, box[:2] + 0.5))
        # box fpcoor=h -> intcoor=h-1, inclusive
        x1, y1 = list(map(int, box[2:] - 0.5))    # inclusive
        x1 = max(x0, x1)    # require at least 1x1
        y1 = max(y0, y1)

        w = x1 + 1 - x0
        h = y1 + 1 - y0

        # rounding errors could happen here, because masks were not originally computed for this shape.
        # but it's hard to do better, because the network does not know the "original" scale
        mask = (cv2.resize(mask, (w, h)) > 0.5).astype('uint8')
        ret = np.zeros(shape, dtype='uint8')
        ret[y0:y1 + 1, x0:x1 + 1] = mask
        return ret
def predict_image(img, model_func):
...
...
@@ -82,7 +119,6 @@ def predict_image(img, model_func):
    Returns:
        [DetectionResult]
    """
    orig_shape = img.shape[:2]
    resizer = CustomResize(cfg.PREPROC.TEST_SHORT_EDGE_SIZE, cfg.PREPROC.MAX_SIZE)
    resized_img = resizer.augment(img)
...
...
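For reference, a small usage sketch of the new mask postprocessing in `eval.py` above. The inputs are made up (a 28x28 mask and an arbitrary box), but they have the shapes the code expects: a square mask from the mask head, a float box as (x0, y0, x1, y1) in original-image coordinates, and the original image shape (h, w):

    import numpy as np

    mask28 = (np.random.rand(28, 28) > 0.5).astype('float32')    # toy square mask for one detection
    box = np.array([40.0, 30.0, 120.0, 90.0], dtype='float32')   # x0, y0, x1, y1 in the original image
    full = _paste_mask(box, mask28, shape=(240, 320))            # uint8 binary image of shape (240, 320)
    assert full.shape == (240, 320) and full.dtype == np.uint8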
examples/FasterRCNN/utils/README.md
# Some third-party helper functions

+ generate_anchors.py: copied from [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py).
+ box_ops.py: modified from [TF object detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/core/box_list_ops.py).
+ np_box_ops.py: copied from [TF object detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/utils/np_box_ops.py).
examples/FasterRCNN/utils/generate_anchors.py (deleted, 100644 → 0)
# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import numpy as np
from six.moves import range

# Verify that we compute the same anchors as Shaoqing's matlab implementation:
#
#    >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
#    >> anchors
#
#    anchors =
#
#       -83   -39   100    56
#      -175   -87   192   104
#      -359  -183   376   200
#       -55   -55    72    72
#      -119  -119   136   136
#      -247  -247   264   264
#       -35   -79    52    96
#       -79  -167    96   184
#      -167  -343   184   360

# array([[ -83.,  -39.,  100.,   56.],
#        [-175.,  -87.,  192.,  104.],
#        [-359., -183.,  376.,  200.],
#        [ -55.,  -55.,   72.,   72.],
#        [-119., -119.,  136.,  136.],
#        [-247., -247.,  264.,  264.],
#        [ -35.,  -79.,   52.,   96.],
#        [ -79., -167.,   96.,  184.],
#        [-167., -343.,  184.,  360.]])


def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2**np.arange(3, 6)):
    """
    Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, 15, 15) window.
    """
    base_anchor = np.array([1, 1, base_size, base_size], dtype='float32') - 1
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
                         for i in range(ratio_anchors.shape[0])])
    return anchors


def _whctrs(anchor):
    """
    Return width, height, x center, and y center for an anchor (window).
    """
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    """
    Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows).
    """
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                         y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1),
                         y_ctr + 0.5 * (hs - 1)))
    return anchors


def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


def _scale_enum(anchor, scales):
    """
    Enumerate a set of anchors for each scale wrt an anchor.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors
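The file above is deleted by this commit (the commit message switches the example to un-quantized anchors). For reference, a minimal sketch of how the removed function was typically called; with its default arguments it enumerates 3 aspect ratios x 3 scales around a 16x16 reference window:

    import numpy as np

    anchors = generate_anchors(base_size=16, ratios=[0.5, 1, 2], scales=2**np.arange(3, 6))
    print(anchors.shape)  # (9, 4): one (x0, y0, x1, y1) row per ratio/scale combination
    print(anchors)        # corners come out as whole numbers because widths/heights
                          # are rounded ("quantized") in _ratio_enum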