Commit 69d4e940 authored by Yuxin Wu

[MaskRCNN] improvements on dataset loading

parent 7a19c73f
...@@ -82,12 +82,14 @@ _C.MODE_FPN = False ...@@ -82,12 +82,14 @@ _C.MODE_FPN = False
# dataset ----------------------- # dataset -----------------------
_C.DATA.BASEDIR = '/path/to/your/DATA/DIR' _C.DATA.BASEDIR = '/path/to/your/DATA/DIR'
# All TRAIN dataset will be concatenated for training. # All TRAIN dataset will be concatenated for training.
_C.DATA.TRAIN = ['train2014', 'valminusminival2014'] # i.e. trainval35k, AKA train2017 _C.DATA.TRAIN = ('train2014', 'valminusminival2014') # i.e. trainval35k, AKA train2017
# Each VAL dataset will be evaluated separately (instead of concatenated) # Each VAL dataset will be evaluated separately (instead of concatenated)
_C.DATA.VAL = ('minival2014', ) # AKA val2017 _C.DATA.VAL = ('minival2014', ) # AKA val2017
# These two configs will be populated later by the dataset loader: # These two configs will be populated later by the dataset loader:
_C.DATA.NUM_CATEGORY = 0 # without the background class (e.g., 80 for COCO) _C.DATA.NUM_CATEGORY = 0 # without the background class (e.g., 80 for COCO)
_C.DATA.CLASS_NAMES = [] # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG". _C.DATA.CLASS_NAMES = [] # NUM_CLASS (NUM_CATEGORY+1) strings, the first is "BG".
# whether the coordinates in the annotations are absolute pixel values, or relative values in [0, 1]
_C.DATA.ABSOLUTE_COORD = True
# basemodel ---------------------- # basemodel ----------------------
_C.BACKBONE.WEIGHTS = '' # /path/to/weights.npz _C.BACKBONE.WEIGHTS = '' # /path/to/weights.npz
......
...@@ -305,9 +305,14 @@ def get_train_dataflow(): ...@@ -305,9 +305,14 @@ def get_train_dataflow():
im = cv2.imread(fname, cv2.IMREAD_COLOR) im = cv2.imread(fname, cv2.IMREAD_COLOR)
assert im is not None, fname assert im is not None, fname
im = im.astype('float32') im = im.astype('float32')
height, width = im.shape[:2]
# assume floatbox as input # assume floatbox as input
assert boxes.dtype == np.float32, "Loader has to return floating point boxes!" assert boxes.dtype == np.float32, "Loader has to return floating point boxes!"
if not cfg.DATA.ABSOLUTE_COORD:
boxes[:, 0::2] *= width
boxes[:, 1::2] *= height
# augmentation: # augmentation:
im, params = aug.augment_return_params(im) im, params = aug.augment_return_params(im)
points = box_to_point8(boxes) points = box_to_point8(boxes)
...@@ -346,7 +351,10 @@ def get_train_dataflow(): ...@@ -346,7 +351,10 @@ def get_train_dataflow():
# Apply augmentation on polygon coordinates. # Apply augmentation on polygon coordinates.
# And produce one image-sized binary mask per box. # And produce one image-sized binary mask per box.
masks = [] masks = []
width_height = np.asarray([width, height], dtype=np.float32)
for polys in segmentation: for polys in segmentation:
if not cfg.DATA.ABSOLUTE_COORD:
polys = [p * width_height for p in polys]
polys = [aug.augment_coords(p, params) for p in polys] polys = [aug.augment_coords(p, params) for p in polys]
masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1])) masks.append(segmentation_to_mask(polys, im.shape[0], im.shape[1]))
masks = np.asarray(masks, dtype='uint8') # values in {0, 1} masks = np.asarray(masks, dtype='uint8') # values in {0, 1}
...@@ -380,7 +388,7 @@ def get_eval_dataflow(name, shard=0, num_shards=1): ...@@ -380,7 +388,7 @@ def get_eval_dataflow(name, shard=0, num_shards=1):
img_range = (shard * img_per_shard, (shard + 1) * img_per_shard if shard + 1 < num_shards else num_imgs) img_range = (shard * img_per_shard, (shard + 1) * img_per_shard if shard + 1 < num_shards else num_imgs)
# no filter for training # no filter for training
ds = DataFromListOfDict(roidbs[img_range[0]: img_range[1]], ['file_name', 'id']) ds = DataFromListOfDict(roidbs[img_range[0]: img_range[1]], ['file_name', 'image_id'])
def f(fname): def f(fname):
im = cv2.imread(fname, cv2.IMREAD_COLOR) im = cv2.imread(fname, cv2.IMREAD_COLOR)
......
...@@ -14,7 +14,7 @@ from config import config as cfg ...@@ -14,7 +14,7 @@ from config import config as cfg
__all__ = ['COCODetection', 'DetectionDataset'] __all__ = ['COCODetection', 'DetectionDataset']
class COCODetection(object): class COCODetection:
# handle the weird (but standard) split of train and val # handle the weird (but standard) split of train and val
_INSTANCE_TO_BASEDIR = { _INSTANCE_TO_BASEDIR = {
'valminusminival2014': 'val2014', 'valminusminival2014': 'val2014',
...@@ -32,6 +32,7 @@ class COCODetection(object): ...@@ -32,6 +32,7 @@ class COCODetection(object):
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] # noqa "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] # noqa
def __init__(self, basedir, name): def __init__(self, basedir, name):
basedir = os.path.expanduser(basedir)
self.name = name self.name = name
self._imgdir = os.path.realpath(os.path.join( self._imgdir = os.path.realpath(os.path.join(
basedir, self._INSTANCE_TO_BASEDIR.get(name, name))) basedir, self._INSTANCE_TO_BASEDIR.get(name, name)))
...@@ -81,7 +82,7 @@ class COCODetection(object): ...@@ -81,7 +82,7 @@ class COCODetection(object):
Returns: Returns:
a list of dict, each has keys including: a list of dict, each has keys including:
'height', 'width', 'id', 'file_name', 'id', 'file_name',
and (if add_gt is True) 'boxes', 'class', 'is_crowd', and optionally and (if add_gt is True) 'boxes', 'class', 'is_crowd', and optionally
'segmentation'. 'segmentation'.
""" """
...@@ -118,8 +119,8 @@ class COCODetection(object): ...@@ -118,8 +119,8 @@ class COCODetection(object):
# clean-up boxes # clean-up boxes
valid_objs = [] valid_objs = []
width = img['width'] width = img.pop('width')
height = img['height'] height = img.pop('height')
for objid, obj in enumerate(objs): for objid, obj in enumerate(objs):
if obj.get('ignore', 0) == 1: if obj.get('ignore', 0) == 1:
continue continue
...@@ -162,6 +163,7 @@ class COCODetection(object): ...@@ -162,6 +163,7 @@ class COCODetection(object):
img['boxes'] = boxes # nx4 img['boxes'] = boxes # nx4
img['class'] = cls # n, always >0 img['class'] = cls # n, always >0
img['is_crowd'] = is_crowd # n, img['is_crowd'] = is_crowd # n,
img['image_id'] = img.pop('id')
if add_mask: if add_mask:
# also required to be float32 # also required to be float32
img['segmentation'] = [ img['segmentation'] = [
...@@ -183,7 +185,7 @@ class COCODetection(object): ...@@ -183,7 +185,7 @@ class COCODetection(object):
return ret return ret
class DetectionDataset(object): class DetectionDataset:
""" """
A singleton to load datasets, evaluate results, and provide metadata. A singleton to load datasets, evaluate results, and provide metadata.
...@@ -209,7 +211,6 @@ class DetectionDataset(object): ...@@ -209,7 +211,6 @@ class DetectionDataset(object):
Produce "roidbs" as a list of dict, each dict corresponds to one image with k>=0 instances. Produce "roidbs" as a list of dict, each dict corresponds to one image with k>=0 instances.
and the following keys are expected for training: and the following keys are expected for training:
height, width: integer
file_name: str, full path to the image file_name: str, full path to the image
boxes: numpy array of kx4 floats, each row is [x1, y1, x2, y2] boxes: numpy array of kx4 floats, each row is [x1, y1, x2, y2]
class: numpy array of k integers, in the range of [1, #categories], NOT [0, #categories) class: numpy array of k integers, in the range of [1, #categories], NOT [0, #categories)
...@@ -225,7 +226,7 @@ class DetectionDataset(object): ...@@ -225,7 +226,7 @@ class DetectionDataset(object):
Include this field only if training Mask R-CNN. Include this field only if training Mask R-CNN.
""" """
return COCODetection.load_many( return COCODetection.load_many(
cfg.DATA.BASEDIR, cfg.DATA.TRAIN, add_gt=True, add_mask=cfg.MODE_MASK) cfg.DATA.BASEDIR, names, add_gt=True, add_mask=cfg.MODE_MASK)
def load_inference_roidbs(self, name): def load_inference_roidbs(self, name):
""" """
...@@ -239,7 +240,7 @@ class DetectionDataset(object): ...@@ -239,7 +240,7 @@ class DetectionDataset(object):
following keys in the dict are expected: following keys in the dict are expected:
file_name (str): full path to the image file_name (str): full path to the image
id (str): an id for the image. The inference results will be stored with this id. image_id (str): an id for the image. The inference results will be stored with this id.
""" """
return COCODetection.load_many(cfg.DATA.BASEDIR, name, add_gt=False) return COCODetection.load_many(cfg.DATA.BASEDIR, name, add_gt=False)
...@@ -274,7 +275,7 @@ class DetectionDataset(object): ...@@ -274,7 +275,7 @@ class DetectionDataset(object):
assert output is not None, "COCO evaluation requires an output file!" assert output is not None, "COCO evaluation requires an output file!"
with open(output, 'w') as f: with open(output, 'w') as f:
json.dump(results, f) json.dump(results, f)
if len(output): if len(results):
# sometimes may crash if the results are empty? # sometimes may crash if the results are empty?
return COCODetection(cfg.DATA.BASEDIR, dataset).print_coco_metrics(output) return COCODetection(cfg.DATA.BASEDIR, dataset).print_coco_metrics(output)
else: else:
...@@ -290,6 +291,7 @@ class DetectionDataset(object): ...@@ -290,6 +291,7 @@ class DetectionDataset(object):
if __name__ == '__main__': if __name__ == '__main__':
cfg.DATA.BASEDIR = '~/data/coco'
c = COCODetection(cfg.DATA.BASEDIR, 'train2014') c = COCODetection(cfg.DATA.BASEDIR, 'train2014')
gt_boxes = c.load(add_gt=True, add_mask=True) roidb = c.load(add_gt=True, add_mask=True)
print("#Images:", len(gt_boxes)) print("#Images:", len(roidb))
...@@ -127,10 +127,11 @@ def predict_dataflow(df, model_func, tqdm_bar=None): ...@@ -127,10 +127,11 @@ def predict_dataflow(df, model_func, tqdm_bar=None):
for img, img_id in df: for img, img_id in df:
results = predict_image(img, model_func) results = predict_image(img, model_func)
for r in results: for r in results:
# int()/float() to make it json-serializable
res = { res = {
'image_id': img_id, 'image_id': img_id,
'category_id': int(r.class_id), # int() to make it json-serializable 'category_id': int(r.class_id),
'bbox': list(r.box), 'bbox': [round(float(x), 4) for x in r.box],
'score': round(float(r.score), 4), 'score': round(float(r.score), 4),
} }
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
import operator import operator
import os import os
import numpy as np
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from collections import deque from collections import deque
import six import six
...@@ -272,7 +273,7 @@ class ScheduledHyperParamSetter(HyperParamSetter): ...@@ -272,7 +273,7 @@ class ScheduledHyperParamSetter(HyperParamSetter):
for p in range(0, self._current_point() + 1): for p in range(0, self._current_point() + 1):
v = self._get_value_to_set_at_point(p) or v v = self._get_value_to_set_at_point(p) or v
actual_value = self.param.get_value() actual_value = self.param.get_value()
if v is not None and v != actual_value: if v is not None and not np.isclose(v, actual_value):
logger.warn("According to scheduler {}, parameter '{}' should become {} at the current point. " logger.warn("According to scheduler {}, parameter '{}' should become {} at the current point. "
"However its current value is {}. " "However its current value is {}. "
"If this is the only scheduler being used, you may want to check whether your " "If this is the only scheduler being used, you may want to check whether your "
......
...@@ -96,8 +96,8 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, ...@@ -96,8 +96,8 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699 Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
sync_statistics (str or None): one of None, "nccl", or "horovod". sync_statistics (str or None): one of None, "nccl", or "horovod".
By default (None), it uses statistics of the input tensor to normalize. By default (None), it uses statistics of the input tensor to normalize during training.
This is the standard way BatchNorm was done in most frameworks. This is the standard way BatchNorm was implemented in most frameworks.
When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers. When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers.
It uses the aggregated statistics of the whole batch (across all GPUs) to normalize. It uses the aggregated statistics of the whole batch (across all GPUs) to normalize.
...@@ -106,7 +106,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, ...@@ -106,7 +106,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize. It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize.
Note that on single machine this is significantly slower than the "nccl" implementation. Note that on single machine this is significantly slower than the "nccl" implementation.
If not None, per-GPU E[x] and E[x^2] among all GPUs are averaged to compute When enabled, per-GPU E[x] and E[x^2] among all GPUs are averaged to compute
global mean & variance. Therefore each GPU needs to have the same batch size. global mean & variance. Therefore each GPU needs to have the same batch size.
The synchronization is based on the current variable scope + the name of the layer The synchronization is based on the current variable scope + the name of the layer
...@@ -119,7 +119,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, ...@@ -119,7 +119,7 @@ def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
If different GPUs execute one BatchNorm layer for different number of times If different GPUs execute one BatchNorm layer for different number of times
(e.g., if some GPUs do not execute it), this layer may hang. (e.g., if some GPUs do not execute it), this layer may hang.
This option only has effect in standard training mode. This option only has effect when `training == get_current_tower_context().training == True`.
This option is also known as "Cross-GPU BatchNorm" as mentioned in: This option is also known as "Cross-GPU BatchNorm" as mentioned in:
`MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_. `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment