Commit 69d4e940 authored by Yuxin Wu

[MaskRCNN] improvements on dataset loading

parent 7a19c73f
@@ -82,12 +82,14 @@ _C.MODE_FPN = False
# dataset -----------------------
_C.DATA.BASEDIR = '/path/to/your/DATA/DIR'
# All TRAIN dataset will be concatenated for training.
-_C.DATA.TRAIN = ['train2014', 'valminusminival2014']   # i.e. trainval35k, AKA train2017
+_C.DATA.TRAIN = ('train2014', 'valminusminival2014')   # i.e. trainval35k, AKA train2017
# Each VAL dataset will be evaluated separately (instead of concatenated)
_C.DATA.VAL = ('minival2014', ) # AKA val2017
# These two configs will be populated later by the dataset loader:
_C.DATA.NUM_CATEGORY = 0 # without the background class (e.g., 80 for COCO)
_C.DATA.CLASS_NAMES = [] # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG".
+# whether the coordinates in the annotations are absolute pixel values, or relative values in [0, 1]
+_C.DATA.ABSOLUTE_COORD = True
# basemodel ----------------------
_C.BACKBONE.WEIGHTS = '' # /path/to/weights.npz
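The new DATA.ABSOLUTE_COORD flag lets a loader return coordinates normalized to [0, 1] instead of absolute pixels. A minimal sketch of a user config for such a dataset (the split names are hypothetical; the import matches this example's config module):

# Hypothetical configuration for annotations stored in relative coordinates:
from config import config as cfg

cfg.DATA.BASEDIR = '/path/to/your/DATA/DIR'
cfg.DATA.TRAIN = ('my_train_split',)    # hypothetical split; note tuples, as above
cfg.DATA.VAL = ('my_val_split',)
cfg.DATA.ABSOLUTE_COORD = False         # boxes/polygons are fractions of image size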
@@ -305,9 +305,14 @@ def get_train_dataflow():
im = cv2.imread(fname, cv2.IMREAD_COLOR)
assert im is not None, fname
im = im.astype('float32')
+height, width = im.shape[:2]
# assume floatbox as input
assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"
+if not cfg.DATA.ABSOLUTE_COORD:
+    boxes[:, 0::2] *= width
+    boxes[:, 1::2] *= height
# augmentation:
im, params = aug.augment_return_params(im)
points = box_to_point8(boxes)
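The slice assignments above rely on boxes being in XYXY order: columns 0 and 2 hold x coordinates, columns 1 and 3 hold y. A quick numeric check of the rescale (made-up values), applied only when cfg.DATA.ABSOLUTE_COORD is False:

import numpy as np

boxes = np.array([[0.10, 0.20, 0.50, 0.80]], dtype=np.float32)  # relative XYXY
height, width = 480, 640
boxes[:, 0::2] *= width    # x1, x2 -> columns 0 and 2
boxes[:, 1::2] *= height   # y1, y2 -> columns 1 and 3
print(boxes)               # [[ 64.  96. 320. 384.]]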
@@ -346,7 +351,10 @@ def get_train_dataflow():
# Apply augmentation on polygon coordinates.
# And produce one image-sized binary mask per box.
masks = []
+width_height = np.asarray([width, height], dtype=np.float32)
for polys in segmentation:
+    if not cfg.DATA.ABSOLUTE_COORD:
+        polys = [p * width_height for p in polys]
    polys = [aug.augment_coords(p, params) for p in polys]
    masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
masks = np.asarray(masks, dtype='uint8') # values in {0, 1}
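Polygons get the same rescale via broadcasting: each polygon is an (n, 2) array of (x, y) points, so multiplying by [width, height] scales both columns at once. A small sketch with made-up coordinates:

import numpy as np

width_height = np.asarray([640, 480], dtype=np.float32)
poly = np.array([[0.1, 0.1], [0.5, 0.1], [0.5, 0.9]], dtype=np.float32)
poly_abs = poly * width_height   # broadcasts per row: x * 640, y * 480
print(poly_abs)                  # rows: [64, 48], [320, 48], [320, 432]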
@@ -380,7 +388,7 @@ def get_eval_dataflow(name, shard=0, num_shards=1):
img_range = (shard * img_per_shard, (shard + 1) * img_per_shard if shard + 1 < num_shards else num_imgs)
# no filter for training
-ds = DataFromListOfDict(roidbs[img_range[0]: img_range[1]], ['file_name', 'id'])
+ds = DataFromListOfDict(roidbs[img_range[0]: img_range[1]], ['file_name', 'image_id'])
def f(fname):
im = cv2.imread(fname, cv2.IMREAD_COLOR)
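The img_range arithmetic splits the evaluation set as evenly as integer division allows, with the last shard absorbing the remainder. For example, assuming img_per_shard = num_imgs // num_shards on an elided line above:

num_imgs, num_shards = 10, 3
img_per_shard = num_imgs // num_shards   # 3
for shard in range(num_shards):
    start = shard * img_per_shard
    end = (shard + 1) * img_per_shard if shard + 1 < num_shards else num_imgs
    print(shard, (start, end))   # 0 (0, 3)   1 (3, 6)   2 (6, 10)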
@@ -14,7 +14,7 @@ from config import config as cfg
__all__ = ['COCODetection', 'DetectionDataset']
-class COCODetection(object):
+class COCODetection:
# handle the weird (but standard) split of train and val
_INSTANCE_TO_BASEDIR = {
'valminusminival2014': 'val2014',
@@ -32,6 +32,7 @@ class COCODetection(object):
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] # noqa
def __init__(self, basedir, name):
+basedir = os.path.expanduser(basedir)
self.name = name
self._imgdir = os.path.realpath(os.path.join(
basedir, self._INSTANCE_TO_BASEDIR.get(name, name)))
@@ -81,7 +82,7 @@ class COCODetection(object):
Returns:
a list of dict, each has keys including:
-'height', 'width', 'id', 'file_name',
+'id', 'file_name',
and (if add_gt is True) 'boxes', 'class', 'is_crowd', and optionally
'segmentation'.
"""
@@ -118,8 +119,8 @@ class COCODetection(object):
# clean-up boxes
valid_objs = []
-width = img['width']
-height = img['height']
+width = img.pop('width')
+height = img.pop('height')
for objid, obj in enumerate(objs):
if obj.get('ignore', 0) == 1:
continue
@@ -162,6 +163,7 @@ class COCODetection(object):
img['boxes'] = boxes # nx4
img['class'] = cls # n, always >0
img['is_crowd'] = is_crowd # n,
+img['image_id'] = img.pop('id')
if add_mask:
# also required to be float32
img['segmentation'] = [
@@ -183,7 +185,7 @@ class COCODetection(object):
return ret
-class DetectionDataset(object):
+class DetectionDataset:
"""
A singleton to load datasets, evaluate results, and provide metadata.
@@ -209,7 +211,6 @@ class DetectionDataset(object):
Produce "roidbs" as a list of dict, each dict corresponds to one image with k>=0 instances.
and the following keys are expected for training:
height, width: integer
file_name: str, full path to the image
boxes: numpy array of kx4 floats, each row is [x1, y1, x2, y2]
class: numpy array of k integers, in the range of [1, #categories], NOT [0, #categories)
@@ -225,7 +226,7 @@ class DetectionDataset(object):
Include this field only if training Mask R-CNN.
"""
return COCODetection.load_many(
-    cfg.DATA.BASEDIR, cfg.DATA.TRAIN, add_gt=True, add_mask=cfg.MODE_MASK)
+    cfg.DATA.BASEDIR, names, add_gt=True, add_mask=cfg.MODE_MASK)
def load_inference_roidbs(self, name):
"""
@@ -239,7 +240,7 @@ class DetectionDataset(object):
following keys in the dict are expected:
file_name (str): full path to the image
-id (str): an id for the image. The inference results will be stored with this id.
+image_id (str): an id for the image. The inference results will be stored with this id.
"""
return COCODetection.load_many(cfg.DATA.BASEDIR, name, add_gt=False)
@@ -274,7 +275,7 @@ class DetectionDataset(object):
assert output is not None, "COCO evaluation requires an output file!"
with open(output, 'w') as f:
json.dump(results, f)
-if len(output):
+if len(results):
# COCO evaluation may crash if the results are empty
return COCODetection(cfg.DATA.BASEDIR, dataset).print_coco_metrics(output)
else:
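The guard change matters because output is a file name: any non-empty string made the old check pass, even with zero detections. Checking the results themselves closes that hole:

output = 'predictions.json'   # always a non-empty string
results = []                  # e.g. no detections were produced
assert len(output) > 0        # old guard: passes regardless
assert len(results) == 0      # new guard: correctly catches the empty case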
@@ -290,6 +291,7 @@ class DetectionDataset(object):
if __name__ == '__main__':
cfg.DATA.BASEDIR = '~/data/coco'
c = COCODetection(cfg.DATA.BASEDIR, 'train2014')
-gt_boxes = c.load(add_gt=True, add_mask=True)
-print("#Images:", len(gt_boxes))
+roidb = c.load(add_gt=True, add_mask=True)
+print("#Images:", len(roidb))
@@ -127,10 +127,11 @@ def predict_dataflow(df, model_func, tqdm_bar=None):
for img, img_id in df:
    results = predict_image(img, model_func)
    for r in results:
+        # int()/float() to make it json-serializable
        res = {
            'image_id': img_id,
-            'category_id': int(r.class_id),   # int() to make it json-serializable
-            'bbox': list(r.box),
+            'category_id': int(r.class_id),
+            'bbox': [round(float(x), 4) for x in r.box],
            'score': round(float(r.score), 4),
        }
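Besides trimming file size, the float() conversion is what makes the bbox serializable at all: r.box is typically a float32 NumPy array, and the stdlib json module rejects NumPy scalars. A quick demonstration:

import json
import numpy as np

box = np.array([12.34567, 8.9, 100.0, 200.5], dtype=np.float32)
# json.dumps(list(box)) would raise TypeError: float32 is not JSON serializable
bbox = [round(float(x), 4) for x in box]
print(json.dumps(bbox))   # [12.3457, 8.9, 100.0, 200.5]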
@@ -4,6 +4,7 @@
import operator
import os
+import numpy as np
from abc import ABCMeta, abstractmethod
from collections import deque
import six
@@ -272,7 +273,7 @@ class ScheduledHyperParamSetter(HyperParamSetter):
for p in range(0, self._current_point() + 1):
v = self._get_value_to_set_at_point(p) or v
actual_value = self.param.get_value()
-if v is not None and v != actual_value:
+if v is not None and not np.isclose(v, actual_value):
logger.warn("According to scheduler {}, parameter '{}' should become {} at the current point. "
"However its current value is {}. "
"If this is the only scheduler being used, you may want to check whether your "
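Switching to np.isclose avoids spurious warnings when the scheduled value (a Python float) is compared against a parameter stored as float32, where exact equality can fail purely due to precision:

import numpy as np

v = 0.003                     # value from the schedule (a float64)
actual = np.float32(0.003)    # value read back from a float32 variable
print(v != actual)            # True: 0.003 is not exactly representable in float32
print(np.isclose(v, actual))  # True: within tolerance, so no warning is emitted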
@@ -96,8 +96,8 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
sync_statistics (str or None): one of None, "nccl", or "horovod".
-By default (None), it uses statistics of the input tensor to normalize.
-This is the standard way BatchNorm was done in most frameworks.
+By default (None), it uses statistics of the input tensor to normalize during training.
+This is the standard way BatchNorm was implemented in most frameworks.
When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers.
It uses the aggregated statistics of the whole batch (across all GPUs) to normalize.
@@ -106,7 +106,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize.
Note that on single machine this is significantly slower than the "nccl" implementation.
-If not None, per-GPU E[x] and E[x^2] among all GPUs are averaged to compute
+When enabled, per-GPU E[x] and E[x^2] among all GPUs are averaged to compute
global mean & variance. Therefore each GPU needs to have the same batch size.
The synchronization is based on the current variable scope + the name of the layer
@@ -119,7 +119,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
If different GPUs execute one BatchNorm layer for different number of times
(e.g., if some GPUs do not execute it), this layer may hang.
-This option only has effect in standard training mode.
+This option only has effect when `training == get_current_tower_context().training == True`.
This option is also known as "Cross-GPU BatchNorm" as mentioned in:
`MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_.
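The aggregated-moments trick the docstring describes can be written out directly: averaging per-GPU E[x] and E[x^2] yields the global mean and variance without gathering the activations themselves. A NumPy sketch with two simulated GPUs (the equal per-GPU batch size is exactly what makes the plain average correct, hence the requirement above):

import numpy as np

x_gpu0 = np.random.randn(32, 8).astype(np.float32)   # batch on GPU 0
x_gpu1 = np.random.randn(32, 8).astype(np.float32)   # batch on GPU 1

ex = (x_gpu0.mean(axis=0) + x_gpu1.mean(axis=0)) / 2              # global E[x]
ex2 = ((x_gpu0**2).mean(axis=0) + (x_gpu1**2).mean(axis=0)) / 2   # global E[x^2]
var = ex2 - ex**2   # matches np.concatenate([x_gpu0, x_gpu1]).var(axis=0) up to rounding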