Commit 69d4e940 authored by Yuxin Wu

[MaskRCNN] improvements on dataset loading

parent 7a19c73f
@@ -82,12 +82,14 @@ _C.MODE_FPN = False
# dataset -----------------------
_C.DATA.BASEDIR = '/path/to/your/DATA/DIR'
# All TRAIN dataset will be concatenated for training.
-_C.DATA.TRAIN = ['train2014', 'valminusminival2014']   # i.e. trainval35k, AKA train2017
+_C.DATA.TRAIN = ('train2014', 'valminusminival2014')   # i.e. trainval35k, AKA train2017
# Each VAL dataset will be evaluated separately (instead of concatenated)
_C.DATA.VAL = ('minival2014', ) # AKA val2017
# These two configs will be populated later by the dataset loader:
_C.DATA.NUM_CATEGORY = 0 # without the background class (e.g., 80 for COCO)
_C.DATA.CLASS_NAMES = [] # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG".
+# whether the coordinates in the annotations are absolute pixel values, or relative values in [0, 1]
+_C.DATA.ABSOLUTE_COORD = True
# basemodel ----------------------
_C.BACKBONE.WEIGHTS = '' # /path/to/weights.npz
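The new DATA.ABSOLUTE_COORD flag lets a loader return coordinates normalized to [0, 1] instead of absolute pixels. A minimal sketch of a user config for such a dataset (the split names are hypothetical; the import matches this example's config module):

# Hypothetical configuration for annotations stored in relative coordinates:
from config import config as cfg

cfg.DATA.BASEDIR = '/path/to/your/DATA/DIR'
cfg.DATA.TRAIN = ('my_train_split',)    # hypothetical split; note tuples, as above
cfg.DATA.VAL = ('my_val_split',)
cfg.DATA.ABSOLUTE_COORD = False         # boxes/polygons are fractions of image size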
@@ -305,9 +305,14 @@ def get_train_dataflow():
im = cv2.imread(fname, cv2.IMREAD_COLOR)
assert im is not None, fname
im = im.astype('float32')
+height, width = im.shape[:2]
# assume floatbox as input
assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"
+if not cfg.DATA.ABSOLUTE_COORD:
+    boxes[:, 0::2] *= width
+    boxes[:, 1::2] *= height
# augmentation:
im, params = aug.augment_return_params(im)
points = box_to_point8(boxes)
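The slice assignments above rely on boxes being in XYXY order: columns 0 and 2 hold x coordinates, columns 1 and 3 hold y. A quick numeric check of the rescale (made-up values), applied only when cfg.DATA.ABSOLUTE_COORD is False:

import numpy as np

boxes = np.array([[0.10, 0.20, 0.50, 0.80]], dtype=np.float32)  # relative XYXY
height, width = 480, 640
boxes[:, 0::2] *= width    # x1, x2 -> columns 0 and 2
boxes[:, 1::2] *= height   # y1, y2 -> columns 1 and 3
print(boxes)               # [[ 64.  96. 320. 384.]]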
@@ -346,7 +351,10 @@ def get_train_dataflow():
# Apply augmentation on polygon coordinates.
# And produce one image-sized binary mask per box.
masks = []
+width_height = np.asarray([width, height], dtype=np.float32)
for polys in segmentation:
+    if not cfg.DATA.ABSOLUTE_COORD:
+        polys = [p * width_height for p in polys]
    polys = [aug.augment_coords(p, params) for p in polys]
    masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
masks = np.asarray(masks, dtype='uint8') # values in {0, 1}
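Polygons get the same rescale via broadcasting: each polygon is an (n, 2) array of (x, y) points, so multiplying by [width, height] scales both columns at once. A small sketch with made-up coordinates:

import numpy as np

width_height = np.asarray([640, 480], dtype=np.float32)
poly = np.array([[0.1, 0.1], [0.5, 0.1], [0.5, 0.9]], dtype=np.float32)
poly_abs = poly * width_height   # broadcasts per row: x * 640, y * 480
print(poly_abs)                  # rows: [64, 48], [320, 48], [320, 432]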
@@ -380,7 +388,7 @@ def get_eval_dataflow(name, shard=0, num_shards=1):
img_range = (shard * img_per_shard, (shard + 1) * img_per_shard if shard + 1 < num_shards else num_imgs)
# no filter for training
-ds = DataFromListOfDict(roidbs[img_range[0]: img_range[1]], ['file_name', 'id'])
+ds = DataFromListOfDict(roidbs[img_range[0]: img_range[1]], ['file_name', 'image_id'])
def f(fname):
im = cv2.imread(fname, cv2.IMREAD_COLOR)
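The img_range arithmetic splits the evaluation set as evenly as integer division allows, with the last shard absorbing the remainder. For example, assuming img_per_shard = num_imgs // num_shards on an elided line above:

num_imgs, num_shards = 10, 3
img_per_shard = num_imgs // num_shards   # 3
for shard in range(num_shards):
    start = shard * img_per_shard
    end = (shard + 1) * img_per_shard if shard + 1 < num_shards else num_imgs
    print(shard, (start, end))   # 0 (0, 3)   1 (3, 6)   2 (6, 10)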
@@ -14,7 +14,7 @@ from config import config as cfg
__all__ = ['COCODetection', 'DetectionDataset']
-class COCODetection(object):
+class COCODetection:
# handle the weird (but standard) split of train and val
_INSTANCE_TO_BASEDIR = {
'valminusminival2014': 'val2014',
@@ -32,6 +32,7 @@ class COCODetection(object):
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] # noqa
def __init__(self, basedir, name):
+basedir = os.path.expanduser(basedir)
self.name = name
self._imgdir = os.path.realpath(os.path.join(
basedir, self._INSTANCE_TO_BASEDIR.get(name, name)))
@@ -81,7 +82,7 @@ class COCODetection(object):
Returns:
a list of dict, each has keys including:
-'height', 'width', 'id', 'file_name',
+'id', 'file_name',
and (if add_gt is True) 'boxes', 'class', 'is_crowd', and optionally
'segmentation'.
"""
@@ -118,8 +119,8 @@ class COCODetection(object):
# clean-up boxes
valid_objs = []
-width = img['width']
-height = img['height']
+width = img.pop('width')
+height = img.pop('height')
for objid, obj in enumerate(objs):
if obj.get('ignore', 0) == 1:
continue
@@ -162,6 +163,7 @@ class COCODetection(object):
img['boxes'] = boxes # nx4
img['class'] = cls # n, always >0
img['is_crowd'] = is_crowd # n,
+img['image_id'] = img.pop('id')
if add_mask:
# also required to be float32
img['segmentation'] = [
@@ -183,7 +185,7 @@ class COCODetection(object):
return ret
-class DetectionDataset(object):
+class DetectionDataset:
"""
A singleton to load datasets, evaluate results, and provide metadata.
@@ -209,7 +211,6 @@ class DetectionDataset(object):
Produce "roidbs" as a list of dict, each dict corresponds to one image with k>=0 instances.
and the following keys are expected for training:
height, width: integer
file_name: str, full path to the image
boxes: numpy array of kx4 floats, each row is [x1, y1, x2, y2]
class: numpy array of k integers, in the range of [1, #categories], NOT [0, #categories)
@@ -225,7 +226,7 @@ class DetectionDataset(object):
Include this field only if training Mask R-CNN.
"""
return COCODetection.load_many(
-    cfg.DATA.BASEDIR, cfg.DATA.TRAIN, add_gt=True, add_mask=cfg.MODE_MASK)
+    cfg.DATA.BASEDIR, names, add_gt=True, add_mask=cfg.MODE_MASK)
def load_inference_roidbs(self, name):
"""
@@ -239,7 +240,7 @@ class DetectionDataset(object):
following keys in the dict are expected:
file_name (str): full path to the image
-id (str): an id for the image. The inference results will be stored with this id.
+image_id (str): an id for the image. The inference results will be stored with this id.
"""
return COCODetection.load_many(cfg.DATA.BASEDIR, name, add_gt=False)
@@ -274,7 +275,7 @@ class DetectionDataset(object):
assert output is not None, "COCO evaluation requires an output file!"
with open(output, 'w') as f:
json.dump(results, f)
-if len(output):
+if len(results):
# COCO evaluation may crash if the results are empty
return COCODetection(cfg.DATA.BASEDIR, dataset).print_coco_metrics(output)
else:
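The guard change matters because output is a file name: any non-empty string made the old check pass, even with zero detections. Checking the results themselves closes that hole:

output = 'predictions.json'   # always a non-empty string
results = []                  # e.g. no detections were produced
assert len(output) > 0        # old guard: passes regardless
assert len(results) == 0      # new guard: correctly catches the empty case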
@@ -290,6 +291,7 @@ class DetectionDataset(object):
if __name__ == '__main__':
cfg.DATA.BASEDIR = '~/data/coco'
c = COCODetection(cfg.DATA.BASEDIR, 'train2014')
-gt_boxes = c.load(add_gt=True, add_mask=True)
-print("#Images:", len(gt_boxes))
+roidb = c.load(add_gt=True, add_mask=True)
+print("#Images:", len(roidb))
@@ -127,10 +127,11 @@ def predict_dataflow(df, model_func, tqdm_bar=None):
for img, img_id in df:
    results = predict_image(img, model_func)
    for r in results:
+        # int()/float() to make it json-serializable
        res = {
            'image_id': img_id,
-            'category_id': int(r.class_id),   # int() to make it json-serializable
-            'bbox': list(r.box),
+            'category_id': int(r.class_id),
+            'bbox': [round(float(x), 4) for x in r.box],
            'score': round(float(r.score), 4),
        }
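Besides trimming file size, the float() conversion is what makes the bbox serializable at all: r.box is typically a float32 NumPy array, and the stdlib json module rejects NumPy scalars. A quick demonstration:

import json
import numpy as np

box = np.array([12.34567, 8.9, 100.0, 200.5], dtype=np.float32)
# json.dumps(list(box)) would raise TypeError: float32 is not JSON serializable
bbox = [round(float(x), 4) for x in box]
print(json.dumps(bbox))   # [12.3457, 8.9, 100.0, 200.5]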
@@ -4,6 +4,7 @@
import operator
import os
+import numpy as np
from abc import ABCMeta, abstractmethod
from collections import deque
import six
@@ -272,7 +273,7 @@ class ScheduledHyperParamSetter(HyperParamSetter):
for p in range(0, self._current_point() + 1):
v = self._get_value_to_set_at_point(p) or v
actual_value = self.param.get_value()
-if v is not None and v != actual_value:
+if v is not None and not np.isclose(v, actual_value):
logger.warn("According to scheduler {}, parameter '{}' should become {} at the current point. "
"However its current value is {}. "
"If this is the only scheduler being used, you may want to check whether your "
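Switching to np.isclose avoids spurious warnings when the scheduled value (a Python float) is compared against a parameter stored as float32, where exact equality can fail purely due to precision:

import numpy as np

v = 0.003                     # value from the schedule (a float64)
actual = np.float32(0.003)    # value read back from a float32 variable
print(v != actual)            # True: 0.003 is not exactly representable in float32
print(np.isclose(v, actual))  # True: within tolerance, so no warning is emitted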
@@ -96,8 +96,8 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
sync_statistics (str or None): one of None, "nccl", or "horovod".
-By default (None), it uses statistics of the input tensor to normalize.
-This is the standard way BatchNorm was done in most frameworks.
+By default (None), it uses statistics of the input tensor to normalize during training.
+This is the standard way BatchNorm was implemented in most frameworks.
When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers.
It uses the aggregated statistics of the whole batch (across all GPUs) to normalize.
@@ -106,7 +106,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize.
Note that on single machine this is significantly slower than the "nccl" implementation.
-If not None, per-GPU E[x] and E[x^2] among all GPUs are averaged to compute
+When enabled, per-GPU E[x] and E[x^2] among all GPUs are averaged to compute
global mean & variance. Therefore each GPU needs to have the same batch size.
The synchronization is based on the current variable scope + the name of the layer
@@ -119,7 +119,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
If different GPUs execute one BatchNorm layer for different number of times
(e.g., if some GPUs do not execute it), this layer may hang.
-This option only has effect in standard training mode.
+This option only has effect when `training == get_current_tower_context().training == True`.
This option is also known as "Cross-GPU BatchNorm" as mentioned in:
`MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_.
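The aggregated-moments trick the docstring describes can be written out directly: averaging per-GPU E[x] and E[x^2] yields the global mean and variance without gathering the activations themselves. A NumPy sketch with two simulated GPUs (the equal per-GPU batch size is exactly what makes the plain average correct, hence the requirement above):

import numpy as np

x_gpu0 = np.random.randn(32, 8).astype(np.float32)   # batch on GPU 0
x_gpu1 = np.random.randn(32, 8).astype(np.float32)   # batch on GPU 1

ex = (x_gpu0.mean(axis=0) + x_gpu1.mean(axis=0)) / 2              # global E[x]
ex2 = ((x_gpu0**2).mean(axis=0) + (x_gpu1**2).mean(axis=0)) / 2   # global E[x^2]
var = ex2 - ex**2   # matches np.concatenate([x_gpu0, x_gpu1]).var(axis=0) up to rounding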