Commit 23446308 authored by Yuxin Wu

initial commit of FasterRCNN

parent 7c3b404a
# Faster-RCNN on COCO
This example aims to provide a minimal multi-GPU implementation (<1000 lines) of ResNet50-Faster-RCNN on COCO.
## Dependencies
+ TensorFlow nightly.
+ Install [pycocotools](https://github.com/pdollar/coco/tree/master/PythonAPI/pycocotools), OpenCV.
+ Pre-trained [ResNet50 model](https://goo.gl/6XjK9V) from tensorpack model zoo.
+ COCO data. It assumes the following directory structure:
```
DIR/
annotations/
instances_train2014.json
instances_val2014.json
instances_minival2014.json
instances_valminusminival2014.json
train2014/
COCO_train2014_*.jpg
val2014/
COCO_val2014_*.jpg
```
`minival` and `valminusminival` are optional. You can download them
[here](https://github.com/rbgirshick/py-faster-rcnn/blob/master/data/README.md).
## Usage
Change `BASEDIR` in `config.py` to `/path/to/DIR` as described above.
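For example, the relevant line in `config.py` looks like this (a sketch; substitute your own path):
```
BASEDIR = '/path/to/DIR'
```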
To train:
```
./train.py --load /path/to/ImageNet-ResNet50.npz
```
The code is written for training with __8 GPUs__; with fewer GPUs the performance won't be as good.
To predict on an image (and show output in a window):
```
./train.py --predict input.jpg
```
## Results
+ trainval35k/minival, `FASTRCNN_BATCH_PER_IM=256`: 32.9 mAP
+ trainval35k/minival, `FASTRCNN_BATCH_PER_IM=64`: 31.7 mAP. Training takes less than one day on 8 Maxwell TitanX GPUs.
The hyperparameters are not carefully tuned. You can probably get better performance by e.g. training longer.
## Files
This is a minimal implementation that contains only these files:
+ coco.py: load COCO data
+ data.py: prepare data for training
+ common.py: some common data preparation utilities
+ basemodel.py: implement ResNet
+ model.py: implement Faster R-CNN
+ viz.py: visualization utilities
+ utils/: third-party helper functions
+ train.py: main training script
+ eval.py: utilities for evaluation
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: basemodel.py
import tensorflow as tf
from tensorflow.contrib.layers import variance_scaling_initializer
import tensorpack.tfutils.symbolic_functions as symbf
from tensorpack.tfutils.summary import add_moving_summary, add_activation_summary
from tensorpack.tfutils.argscope import argscope, get_arg_scope
from tensorpack.models import (
Conv2D, MaxPooling, BatchNorm, BNReLU, GlobalAvgPooling, FullyConnected)
def image_preprocess(image, bgr=True):
with tf.name_scope('image_preprocess'):
if image.dtype.base_dtype != tf.float32:
image = tf.cast(image, tf.float32)
image = image * (1.0 / 255)
mean = [0.485, 0.456, 0.406] # rgb
std = [0.229, 0.224, 0.225]
if bgr:
mean = mean[::-1]
std = std[::-1]
image_mean = tf.constant(mean, dtype=tf.float32)
image_std = tf.constant(std, dtype=tf.float32)
image = (image - image_mean) / image_std
return image
def get_bn(zero_init=False):
if zero_init:
return lambda x, name: BatchNorm('bn', x, gamma_init=tf.zeros_initializer())
else:
return lambda x, name: BatchNorm('bn', x)
def resnet_shortcut(l, n_out, stride, nl=tf.identity):
data_format = get_arg_scope()['Conv2D']['data_format']
n_in = l.get_shape().as_list()[1 if data_format == 'NCHW' else 3]
if n_in != n_out: # change dimension when channel is not the same
if stride == 2 and 'group3' not in tf.get_variable_scope().name:
l = l[:,:,:-1,:-1]
return Conv2D('convshortcut', l, n_out, 1,
stride=stride, padding='VALID', nl=nl)
else:
return Conv2D('convshortcut', l, n_out, 1,
stride=stride, nl=nl)
else:
return l
def resnet_bottleneck(l, ch_out, stride):
l, shortcut = l, l
l = Conv2D('conv1', l, ch_out, 1, nl=BNReLU)
if stride == 2 and 'group3' not in tf.get_variable_scope().name:
l = tf.pad(l, [[0,0],[0,0],[0,1],[0,1]])
l = Conv2D('conv2', l, ch_out, 3, stride=2, nl=BNReLU, padding='VALID')
else:
l = Conv2D('conv2', l, ch_out, 3, stride=stride, nl=BNReLU)
l = Conv2D('conv3', l, ch_out * 4, 1, nl=get_bn(zero_init=True))
return l + resnet_shortcut(shortcut, ch_out * 4, stride, nl=get_bn(zero_init=False))
def resnet_group(l, name, block_func, features, count, stride):
with tf.variable_scope(name):
for i in range(0, count):
with tf.variable_scope('block{}'.format(i)):
l = block_func(l, features,
stride if i == 0 else 1)
                # end of each block needs an activation
l = tf.nn.relu(l)
return l
def pretrained_resnet_conv4(image, num_blocks):
assert len(num_blocks) == 3
with argscope([Conv2D, MaxPooling, BatchNorm], data_format='NCHW'), \
argscope(Conv2D, nl=tf.identity, use_bias=False), \
argscope(BatchNorm, use_local_stat=False):
l = tf.pad(image, [[0,0],[0,0],[2,3],[2,3]])
l = Conv2D('conv0', l, 64, 7, stride=2, nl=BNReLU, padding='VALID')
l = tf.pad(l, [[0,0],[0,0],[0,1],[0,1]])
l = MaxPooling('pool0', l, shape=3, stride=2, padding='VALID')
l = resnet_group(l, 'group0', resnet_bottleneck, 64, num_blocks[0], 1)
# TODO replace var by const to enable folding
l = tf.stop_gradient(l)
l = resnet_group(l, 'group1', resnet_bottleneck, 128, num_blocks[1], 2)
l = resnet_group(l, 'group2', resnet_bottleneck, 256, num_blocks[2], 2)
# 16x downsampling up to now
return l
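# Shape sketch (assuming the NCHW layout set above): for a 1 x 3 x H x W input,
# pretrained_resnet_conv4 returns a 1 x 1024 x H/16 x W/16 feature map, since
# conv0, pool0, group1 and group2 each downsample by 2 (16x in total) and the
# last bottleneck of group2 outputs 256 * 4 = 1024 channels.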
def resnet_conv5(image):
with argscope([Conv2D, GlobalAvgPooling, BatchNorm], data_format='NCHW'), \
argscope(Conv2D, nl=tf.identity, use_bias=False), \
argscope(BatchNorm, use_local_stat=False):
# 14x14:
l = resnet_group(image, 'group3', resnet_bottleneck, 512, 3, stride=2)
l = GlobalAvgPooling('gap', l)
return l
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: coco.py
import numpy as np
import os
import six
from termcolor import colored
from tabulate import tabulate
from tensorpack.dataflow import DataFromList
from tensorpack.utils import logger
from tensorpack.utils.rect import FloatBox
from tensorpack.utils.timer import timed_operation
from pycocotools.coco import COCO
__all__ = ['COCODetection', 'COCOMeta']
COCO_NUM_CATEGORY = 80
class _COCOMeta(object):
INSTANCE_TO_BASEDIR = {
'train2014': 'train2014',
'val2014': 'val2014',
'valminusminival2014': 'val2014',
'minival2014': 'val2014',
'test2014': 'test2014'
}
def valid(self):
return hasattr(self, 'cat_names')
def create(self, cat_ids, cat_names):
"""
cat_ids: list of ids
cat_names: list of names
"""
assert not self.valid()
assert len(cat_ids) == COCO_NUM_CATEGORY and len(cat_names) == COCO_NUM_CATEGORY
self.cat_names = cat_names
self.class_names = ['BG'] + self.cat_names
# background has class id of 0
self.category_id_to_class_id = {
v: i + 1 for i, v in enumerate(cat_ids)}
self.class_id_to_category_id = {
v: k for k, v in self.category_id_to_class_id.items()}
COCOMeta = _COCOMeta()
class COCODetection(object):
def __init__(self, basedir, name):
assert name in COCOMeta.INSTANCE_TO_BASEDIR.keys(), name
self.name = name
self._imgdir = os.path.join(basedir, COCOMeta.INSTANCE_TO_BASEDIR[name])
assert os.path.isdir(self._imgdir), self._imgdir
annotation_file = os.path.join(
basedir, 'annotations/instances_{}.json'.format(name))
assert os.path.isfile(annotation_file), annotation_file
self.coco = COCO(annotation_file)
# initialize the meta
cat_ids = self.coco.getCatIds()
cat_names = [c['name'] for c in self.coco.loadCats(cat_ids)]
if not COCOMeta.valid():
COCOMeta.create(cat_ids, cat_names)
else:
assert COCOMeta.cat_names == cat_names
logger.info("Instances loaded from {}.".format(annotation_file))
def load(self, add_gt=True):
"""
Args:
add_gt: whether to add ground truth annotations to the dicts
Returns:
a list of dict, each has keys including:
height, width, id, file_name,
and (if add_gt is True) boxes, class, is_crowd
"""
with timed_operation('Load Groundtruth Boxes for {}'.format(self.name)):
img_ids = self.coco.getImgIds()
img_ids.sort()
# list of dict, each has keys: height,width,id,file_name
imgs = self.coco.loadImgs(img_ids)
for img in imgs:
self._use_absolute_file_name(img)
if add_gt:
self._add_detection_gt(img)
return imgs
def _use_absolute_file_name(self, img):
"""
        Change relative filename to absolute file name.
"""
img['file_name'] = os.path.join(
self._imgdir, img['file_name'])
assert os.path.isfile(img['file_name']), img['file_name']
def _add_detection_gt(self, img):
"""
Add 'boxes', 'class', 'is_crowd' of this image to the dict, used by detection.
"""
ann_ids = self.coco.getAnnIds(imgIds=img['id'], iscrowd=None)
objs = self.coco.loadAnns(ann_ids)
# clean-up boxes
valid_objs = []
width = img['width']
height = img['height']
for obj in objs:
if obj.get('ignore', 0) == 1:
continue
x1, y1, w, h = obj['bbox']
# bbox is originally in float
# NOTE: assume in data that x1/y1 means upper-left corner and w/h means true w/h
# assume that (0.0, 0.0) is upper-left corner of the first pixel
box = FloatBox(float(x1), float(y1),
float(x1 + w), float(y1 + h))
box.clip_by_shape([height, width])
# Require non-zero seg area and more than 1x1 box size
if obj['area'] > 0 and box.is_box() and box.area() >= 4:
obj['bbox'] = [box.x1, box.y1, box.x2, box.y2]
valid_objs.append(obj)
# all geometrically-valid boxes are returned
boxes = np.asarray([obj['bbox'] for obj in valid_objs], dtype='float32') # (n, 4)
cls = np.asarray([
COCOMeta.category_id_to_class_id[obj['category_id']]
for obj in valid_objs], dtype='int32') # (n,)
is_crowd = np.asarray([obj['iscrowd'] for obj in valid_objs], dtype='int8')
# add the keys
img['boxes'] = boxes # nx4
img['class'] = cls # n, always >0
img['is_crowd'] = is_crowd # n,
def print_class_histogram(self, imgs):
nr_class = len(COCOMeta.class_names)
hist_bins = np.arange(nr_class + 1)
# Histogram of ground-truth objects
gt_hist = np.zeros((nr_class,), dtype=np.int)
for entry in imgs:
# filter crowd?
gt_inds = np.where(
(entry['class'] > 0) & (entry['is_crowd'] == 0))[0]
gt_classes = entry['class'][gt_inds]
gt_hist += np.histogram(gt_classes, bins=hist_bins)[0]
data = [[COCOMeta.class_names[i], v] for i, v in enumerate(gt_hist)]
data.append(['total', sum([x[1] for x in data])])
table = tabulate(data, headers=['class', '#box'], tablefmt='pipe')
logger.info("Ground-Truth Boxes:\n" + colored(table, 'cyan'))
@staticmethod
def load_many(basedir, names, add_gt=True):
"""
        Load and merge several instance files together.
"""
if not isinstance(names, (list, tuple)):
names = [names]
ret = []
for n in names:
coco = COCODetection(basedir, n)
ret.extend(coco.load(add_gt))
return ret
if __name__ == '__main__':
    import config
    c = COCODetection(config.BASEDIR, 'train2014')
    gt_boxes = c.load()
    print("#Images:", len(gt_boxes))
    c.print_class_histogram(gt_boxes)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: common.py
import numpy as np
import cv2
from tensorpack.dataflow import RNGDataFlow
from tensorpack.dataflow.imgaug import transform
from tensorpack.utils import logger
import config
class DataFromListOfDict(RNGDataFlow):
def __init__(self, lst, keys, shuffle=False):
self._lst = lst
self._keys = keys
self._shuffle = shuffle
self._size = len(lst)
def size(self):
return self._size
def get_data(self):
if self._shuffle:
self.rng.shuffle(self._lst)
for dic in self._lst:
dp = [dic[k] for k in self._keys]
yield dp
class CustomResize(transform.TransformAugmentorBase):
"""
    Try resizing the shortest edge to a certain number
    while making sure the longest edge does not exceed max_size.
"""
def __init__(self, size, max_size, interp=cv2.INTER_LINEAR):
"""
Args:
size (int): the size to resize the shortest edge to.
max_size (int): maximum allowed longest edge.
"""
self._init(locals())
def _get_augment_params(self, img):
h, w = img.shape[:2]
scale = self.size * 1.0 / min(h, w)
if h < w:
newh, neww = self.size, scale * w
else:
newh, neww = scale * h, self.size
if max(newh, neww) > self.max_size:
scale = self.max_size * 1.0 / max(newh, neww)
newh = newh * scale
neww = neww * scale
neww = int(neww + 0.5)
newh = int(newh + 0.5)
return transform.ResizeTransform(h, w, newh, neww, self.interp)
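# Worked example under the default config (SHORT_EDGE_SIZE=600, MAX_SIZE=1024):
# a 480x640 image is scaled by 600/480 = 1.25 to 600x800, which already fits
# under max_size. A 480x1280 image would first become 600x1600, exceed max_size,
# and be rescaled by 1024/1600 = 0.64 to about 384x1024.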
def box_to_point8(boxes):
"""
Args:
boxes: nx4
Returns:
(nx4)x2
"""
b = boxes[:,[0,1,2,3,0,3,2,1]]
b = b.reshape((-1, 2))
return b
def point8_to_box(points):
"""
Args:
points: (nx4)x2
Returns:
nx4 boxes (x1y1x2y2)
"""
p = points.reshape((-1, 4, 2))
minxy = p.min(axis=1) #nx2
maxxy = p.max(axis=1) #nx2
return np.concatenate((minxy, maxxy), axis=1)
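# box_to_point8 and point8_to_box are (approximately) inverse to each other:
# boxes are expanded to their 4 corner points so that point-based augmentors
# (resize, flip) can transform them, and the axis-aligned bounding box is taken
# back afterwards. A small sketch:
#   b = np.array([[10., 20., 30., 60.]], dtype='float32')
#   point8_to_box(box_to_point8(b))    # -> [[10., 20., 30., 60.]]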
def clip_boxes(boxes, shape):
"""
Args:
boxes: nx4, float
shape: h, w
"""
h, w = shape
boxes[:,[0,1]] = np.maximum(boxes[:,[0,1]], 0)
boxes[:,2] = np.minimum(boxes[:,2], w)
boxes[:,3] = np.minimum(boxes[:,3], h)
return boxes
def print_config():
logger.info("Config: ------------------------------------------")
for k in dir(config):
if k == k.upper():
logger.info("{} = {}".format(k, getattr(config, k)))
logger.info("--------------------------------------------------")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: config.py
import numpy as np
# dataset -----------------------
BASEDIR = '/path/to/your/COCO/DIR'
TRAIN_DATASET = ['train2014', 'valminusminival2014']
VAL_DATASET = 'minival2014' # only support evaluation on one dataset
NUM_CLASS = 81
# preprocessing --------------------
SHORT_EDGE_SIZE = 600
MAX_SIZE = 1024
# anchors -------------------------
ANCHOR_STRIDE = 16
# sqrtarea of the anchor box
ANCHOR_SIZES = (32, 64, 128, 256, 512)
ANCHOR_RATIOS = (0.5, 1., 2.)
NR_ANCHOR = len(ANCHOR_SIZES) * len(ANCHOR_RATIOS)
POSITIVE_ANCHOR_THRES = 0.7
NEGATIVE_ANCHOR_THRES = 0.3
# rpn training -------------------------
# keep fg ratio in a batch in this range
RPN_FG_RATIO = 0.5
RPN_BATCH_PER_IM = 256
RPN_MIN_SIZE = 0
RPN_PROPOSAL_NMS_THRESH = 0.7
TRAIN_PRE_NMS_TOPK = 12000
TRAIN_POST_NMS_TOPK = 2000
# boxes overlapping crowd will be ignored.
CROWD_OVERLAP_THRES = 0.7
# fastrcnn training ---------------------
FASTRCNN_BATCH_PER_IM = 64
FASTRCNN_BBOX_REG_WEIGHTS = np.array([10, 10, 5, 5], dtype='float32')
FASTRCNN_FG_THRESH = 0.5
# keep fg ratio in a batch in this range
FASTRCNN_FG_RATIO = (0.1, 0.25)
# testing -----------------------
TEST_PRE_NMS_TOPK = 6000
TEST_POST_NMS_TOPK = 1000
FASTRCNN_NMS_THRESH = 0.5
RESULT_SCORE_THRESH = 0.05
RESULTS_PER_IM = 100
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: data.py
import cv2
import os
import numpy as np
import logging
from tensorpack.utils import logger
from tensorpack.utils.argtools import memoized, log_once
from tensorpack.dataflow import (
ProxyDataFlow, MapData, imgaug, TestDataSpeed,
AugmentImageComponents, MapDataComponent)
import tensorpack.utils.viz as tpviz
from tensorpack.utils.viz import interactive_imshow
from coco import COCODetection
from utils.generate_anchors import generate_anchors
from utils.box_ops import get_iou_callable
from common import (
DataFromListOfDict, CustomResize,
box_to_point8, point8_to_box)
import config
class MalformedData(BaseException):
pass
@memoized
def get_all_anchors():
"""
Get all anchors in the largest possible image, shifted, floatbox
Returns:
anchors: SxSxNR_ANCHORx4, where S == MAX_SIZE//STRIDE, floatbox
"""
# Generates a NAx4 matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors
# are centered on stride / 2, have (approximate) sqrt areas of the specified
# sizes, and aspect ratios as given.
cell_anchors = generate_anchors(
config.ANCHOR_STRIDE,
scales=np.array(config.ANCHOR_SIZES, dtype=np.float) / config.ANCHOR_STRIDE,
ratios=np.array(config.ANCHOR_RATIOS, dtype=np.float))
# anchors are intbox here.
# anchors at featuremap [0,0] are centered at fpcoor (8,8) (half of stride)
field_size = config.MAX_SIZE // config.ANCHOR_STRIDE
shifts = np.arange(0, field_size) * config.ANCHOR_STRIDE
shift_x, shift_y = np.meshgrid(shifts, shifts)
shift_x = shift_x.flatten()
shift_y = shift_y.flatten()
shifts = np.vstack((shift_x, shift_y, shift_x, shift_y)).transpose()
# Kx4, K = field_size * field_size
K = shifts.shape[0]
A = cell_anchors.shape[0]
field_of_anchors = (
cell_anchors.reshape((1, A, 4)) +
shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
field_of_anchors = field_of_anchors.reshape((field_size, field_size, A, 4))
# FSxFSxAx4
assert np.all(field_of_anchors == field_of_anchors.astype('int32'))
field_of_anchors = field_of_anchors.astype('float32')
field_of_anchors[:,:,:,[2,3]] += 1
return field_of_anchors
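# Shape sketch under the default config (MAX_SIZE=1024, ANCHOR_STRIDE=16,
# 5 sizes x 3 ratios = 15 anchors per position): field_size = 1024 // 16 = 64,
# so get_all_anchors() returns a float32 array of shape 64 x 64 x 15 x 4, and
# the anchors at featuremap position (0, 0) are centered at image pixel (8, 8).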
def get_anchor_labels(anchors, gt_boxes, crowd_boxes):
"""
Label each anchor as fg/bg/ignore.
Args:
anchors: Ax4 float
gt_boxes: Bx4 float
crowd_boxes: Cx4 float
Returns:
anchor_labels: (A,) int. Each element is {-1, 0, 1}
anchor_boxes: Ax4. Contains the target gt_box for each anchor when the anchor is fg.
"""
# This function will modify labels and return the filtered inds
def filter_box_label(labels, value, max_num):
curr_inds = np.where(labels == value)[0]
if len(curr_inds) > max_num:
disable_inds = np.random.choice(
curr_inds, size=(len(curr_inds) - max_num),
replace=False)
labels[disable_inds] = -1 # ignore them
curr_inds = np.where(labels == value)[0]
return curr_inds
bbox_iou_float = get_iou_callable()
NA, NB = len(anchors), len(gt_boxes)
assert NB > 0 # empty images should have been filtered already
box_ious = bbox_iou_float(anchors, gt_boxes) # NA x NB
ious_argmax_per_anchor = box_ious.argmax(axis=1) # NA,
ious_max_per_anchor = box_ious.max(axis=1)
ious_max_per_gt = np.amax(box_ious, axis=0, keepdims=True) # 1xNB
    # for each gt, find all anchors (including ties) that have the max iou with it
anchors_with_max_iou_per_gt = np.where(box_ious == ious_max_per_gt)[0]
# Setting NA labels: 1--fg 0--bg -1--ignore
anchor_labels = -np.ones((NA,), dtype='int32') # NA,
    # the order of setting neg/pos labels matters
anchor_labels[anchors_with_max_iou_per_gt] = 1
anchor_labels[ious_max_per_anchor >= config.POSITIVE_ANCHOR_THRES] = 1
anchor_labels[ious_max_per_anchor < config.NEGATIVE_ANCHOR_THRES] = 0
# First label all non-ignore candidate boxes which overlap crowd as ignore
if crowd_boxes.size > 0:
cand_inds = np.where(anchor_labels >= 0)[0]
cand_anchors = anchors[cand_inds]
ious = bbox_iou_float(cand_anchors, crowd_boxes)
overlap_with_crowd = cand_inds[ious.max(axis=1) > config.CROWD_OVERLAP_THRES]
anchor_labels[overlap_with_crowd] = -1
# Filter fg labels: ignore some fg if fg is too many
old_num_fg = np.sum(anchor_labels == 1)
target_num_fg = int(config.RPN_BATCH_PER_IM * config.RPN_FG_RATIO)
fg_inds = filter_box_label(anchor_labels, 1, target_num_fg)
# Note that fg could be fewer than the target ratio
# filter bg labels. num_bg is not allowed to be too many
old_num_bg = np.sum(anchor_labels == 0)
if old_num_bg == 0 or len(fg_inds) == 0:
# No valid bg/fg in this image, skip.
        # This can happen if, e.g., the image has a large crowd.
raise MalformedData("No valid foreground/background for RPN!")
target_num_bg = config.RPN_BATCH_PER_IM - len(fg_inds)
bg_inds = filter_box_label(anchor_labels, 0, target_num_bg)
# Set anchor boxes: the best gt_box for each fg anchor
anchor_boxes = np.zeros((NA, 4), dtype='float32')
fg_boxes = gt_boxes[ious_argmax_per_anchor[fg_inds],:]
anchor_boxes[fg_inds, :] = fg_boxes
return anchor_labels, anchor_boxes
def get_rpn_anchor_input(im, boxes, klass, is_crowd):
"""
Args:
im: an image
        boxes: nx4, floatbox, gt. shouldn't be changed
klass: n,
is_crowd: n,
Returns:
The anchor labels and target boxes for each pixel in the featuremap.
fm_labels: fHxfWxNA
fm_boxes: fHxfWxNAx4
"""
boxes = boxes.copy()
ALL_ANCHORS = get_all_anchors()
H, W = im.shape[:2]
featureH, featureW = H // config.ANCHOR_STRIDE, W // config.ANCHOR_STRIDE
def filter_box_inside(im, boxes):
h, w = im.shape[:2]
indices = np.where(
(boxes[:,0] >= 0) &
(boxes[:,1] >= 0) &
(boxes[:,2] <= w) &
(boxes[:,3] <= h))[0]
return indices
crowd_boxes = boxes[is_crowd == 1]
non_crowd_boxes = boxes[is_crowd == 0]
# fHxfWxAx4
featuremap_anchors = ALL_ANCHORS[:featureH,:featureW,:,:]
featuremap_anchors_flatten = featuremap_anchors.reshape((-1, 4))
# only use anchors inside the image
inside_ind = filter_box_inside(im, featuremap_anchors_flatten)
inside_anchors = featuremap_anchors_flatten[inside_ind,:]
anchor_labels, anchor_boxes = get_anchor_labels(inside_anchors, non_crowd_boxes, crowd_boxes)
# Fill them back to original size: fHxfWx1, fHxfWx4
featuremap_labels = -np.ones((featureH * featureW * config.NR_ANCHOR, ), dtype='int32')
featuremap_labels[inside_ind] = anchor_labels
featuremap_labels = featuremap_labels.reshape((featureH, featureW, config.NR_ANCHOR))
featuremap_boxes = np.zeros((featureH * featureW * config.NR_ANCHOR, 4), dtype='float32')
featuremap_boxes[inside_ind, :] = anchor_boxes
featuremap_boxes = featuremap_boxes.reshape((featureH, featureW, config.NR_ANCHOR, 4))
return featuremap_labels, featuremap_boxes
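# Shape sketch: for a 600x800 input image (after resizing) with ANCHOR_STRIDE=16
# and NR_ANCHOR=15, featureH, featureW = 37, 50, so fm_labels has shape
# 37 x 50 x 15 with values in {-1, 0, 1} and fm_boxes has shape 37 x 50 x 15 x 4
# (gt box targets for fg anchors, zeros elsewhere).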
def read_and_augment_images(ds):
def mapf(dp):
fname = dp[0]
im = cv2.imread(fname, cv2.IMREAD_COLOR).astype('float32')
assert im is not None, dp[0]
dp[0] = im
# assume floatbox as input
assert dp[1].dtype == np.float32
dp[1] = box_to_point8(dp[1])
dp.append(fname)
return dp
ds = MapData(ds, mapf)
augs = [CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE),
imgaug.Flip(horiz=True)]
ds = AugmentImageComponents(ds, augs, index=(0,), coords_index=(1,))
def unmapf(points):
boxes = point8_to_box(points)
return boxes
ds = MapDataComponent(ds, unmapf, 1)
return ds
def get_train_dataflow():
imgs = COCODetection.load_many(config.BASEDIR, config.TRAIN_DATASET)
    # Valid training images should have at least one fg box.
    # This filter should not be applied for testing.
    imgs = list(filter(lambda img: len(img['boxes']) > 0, imgs))  # filter out invalid training images
ds = DataFromListOfDict(
imgs,
        ['file_name', 'boxes', 'class', 'is_crowd'],  # we only need these four keys
shuffle=True)
ds = read_and_augment_images(ds)
def add_anchor_to_dp(dp):
im, boxes, klass, is_crowd, fname = dp
try:
fm_labels, fm_boxes = get_rpn_anchor_input(im, boxes, klass, is_crowd)
boxes = boxes[is_crowd == 0] # skip crowd boxes in training target
klass = klass[is_crowd == 0]
if not len(boxes):
raise MalformedData("No valid gt_boxes!")
except MalformedData as e:
log_once("Input {} is invalid for training: {}".format(fname, str(e)), 'warn')
return None
return [im, fm_labels, fm_boxes, boxes, klass]
ds = MapData(ds, add_anchor_to_dp)
return ds
def get_eval_dataflow():
imgs = COCODetection.load_many(config.BASEDIR, config.VAL_DATASET, add_gt=False)
    # unlike training, no filter is applied for evaluation
ds = DataFromListOfDict(imgs, ['file_name', 'id'])
def f(fname):
im = cv2.imread(fname, cv2.IMREAD_COLOR)
assert im is not None, fname
return im
ds = MapDataComponent(ds, f, 0)
return ds
if __name__ == '__main__':
#logger.setLevel(logging.DEBUG)
from tensorpack.dataflow import PrintData
    ds = get_train_dataflow()  # uses config.BASEDIR; set it in config.py first
ds = PrintData(ds, 100)
TestDataSpeed(ds, 50000).start()
ds.reset_state()
for k in ds.get_data():
pass
#import IPython as IP; IP.embed()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: eval.py
import numpy as np
import tqdm
import cv2
import os
from collections import namedtuple
import tensorflow as tf
from tensorpack.dataflow import MapDataComponent, TestDataSpeed
from tensorpack.tfutils import get_default_sess_config
from tensorpack.utils.argtools import memoized
from tensorpack.utils.utils import get_tqdm_kwargs
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from coco import COCODetection, COCOMeta
from common import clip_boxes, DataFromListOfDict, CustomResize
import config
DetectionResult = namedtuple(
'DetectionResult',
['class_id', 'boxes', 'scores'])
@memoized
def get_tf_nms():
"""
Get a NMS callable.
"""
boxes = tf.placeholder(tf.float32, shape=[None, 4])
scores = tf.placeholder(tf.float32, shape=[None])
indices = tf.image.non_max_suppression(
boxes, scores,
config.RESULTS_PER_IM, config.FASTRCNN_NMS_THRESH)
sess = tf.Session(config=get_default_sess_config())
return sess.make_callable(indices, [boxes, scores])
def nms_fastrcnn_results(boxes, probs):
"""
Args:
boxes: nx4 floatbox in float32
probs: nxC
Returns:
[DetectionResult]
"""
C = probs.shape[1]
boxes = boxes.copy()
boxes_per_class = {}
nms_func = get_tf_nms()
ret = []
for klass in range(1, C):
ids = np.where(probs[:, klass] > config.RESULT_SCORE_THRESH)[0]
if ids.size == 0:
continue
probs_k = probs[ids, klass].flatten()
boxes_k = boxes[ids,:]
selected_ids = nms_func(boxes_k[:,[1,0,3,2]], probs_k)
selected_boxes = boxes_k[selected_ids, :].copy()
ret.append(DetectionResult(klass, selected_boxes, probs_k[selected_ids]))
if len(ret):
newret = []
all_scores = np.hstack([x.scores for x in ret])
if len(all_scores) > config.RESULTS_PER_IM:
score_thresh = np.sort(all_scores)[-config.RESULTS_PER_IM]
for klass, boxes, scores in ret:
keep_ids = np.where(scores >= score_thresh)[0]
if len(keep_ids):
newret.append(DetectionResult(
klass, boxes[keep_ids,:], scores[keep_ids]))
ret = newret
return ret
def detect_one_image(img, model_func):
"""
Run detection on one image, using the TF callable.
This function should handle the preprocessing internally.
Args:
img: an image
model_func: a callable from TF model, takes [image] and returns (probs, boxes)
Returns:
[DetectionResult]
"""
resizer = CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE)
resized_img = resizer.augment(img)
scale = (resized_img.shape[0] * 1.0 / img.shape[0] + resized_img.shape[1] * 1.0 / img.shape[1]) / 2
fg_probs, fg_boxes = model_func([resized_img])
fg_boxes = fg_boxes / scale
fg_boxes = clip_boxes(fg_boxes, img.shape[:2])
return nms_fastrcnn_results(fg_boxes, fg_probs)
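# Minimal usage sketch (assuming `pred` is an OfflinePredictor built from the
# trained model, taking ['image'] and returning fastrcnn_fg_probs /
# fastrcnn_fg_boxes, as done in train.py):
#   results = detect_one_image(cv2.imread('input.jpg'), pred)
#   for r in results:  # each r is a DetectionResult(class_id, boxes, scores)
#       print(COCOMeta.class_names[r.class_id], r.boxes.shape, r.scores)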
def eval_on_dataflow(df, detect_func):
"""
Args:
df: a DataFlow which produces (image, image_id)
detect_func: a callable, takes [image] and returns a dict
Returns:
list of dict, to be dumped to COCO json format
"""
df.reset_state()
all_results = []
with tqdm.tqdm(total=df.size(), **get_tqdm_kwargs()) as pbar:
for img, img_id in df.get_data():
results = detect_func(img)
for classid, boxes, scores in results:
cat_id = COCOMeta.class_id_to_category_id[classid]
boxes[:,2] -= boxes[:,0]
boxes[:,3] -= boxes[:,1]
for box, score in zip(boxes, scores):
all_results.append({
'image_id': img_id,
'category_id': cat_id,
'bbox': list(map(lambda x: float(round(x, 1)), box)),
'score': float(round(score, 2)),
})
pbar.update(1)
return all_results
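# Note: COCO's result json expects boxes as [x, y, width, height], which is why
# x2, y2 are converted to width, height above before being dumped.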
# https://github.com/pdollar/coco/blob/master/PythonAPI/pycocoEvalDemo.ipynb
def print_evaluation_scores(json_file):
assert config.BASEDIR and os.path.isdir(config.BASEDIR)
annofile = os.path.join(
config.BASEDIR, 'annotations',
'instances_{}.json'.format(config.VAL_DATASET))
coco = COCO(annofile)
cocoDt = coco.loadRes(json_file)
imgIds = sorted(coco.getImgIds())
cocoEval = COCOeval(coco, cocoDt, 'bbox')
cocoEval.params.imgIds = imgIds
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
if __name__ == '__main__':
    from data import get_eval_dataflow
    ds = get_eval_dataflow()  # uses config.BASEDIR / config.VAL_DATASET
    print("Size: ", ds.size())
    TestDataSpeed(ds, 1000).start()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: model.py
import numpy as np
import tensorflow as tf
from tensorpack.tfutils import get_current_tower_context
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.tfutils.argscope import argscope
from tensorpack.tfutils.scope_utils import under_name_scope
from tensorpack.models import Conv2D, FullyConnected
from utils.box_ops import pairwise_iou
import config
def rpn_head(featuremap):
with tf.variable_scope('rpn'), \
argscope(Conv2D, data_format='NCHW',
W_init=tf.random_normal_initializer(stddev=0.01)):
hidden = Conv2D('conv0', featuremap, 1024, 3, nl=tf.nn.relu)
label_logits = Conv2D('class', hidden, config.NR_ANCHOR, 1)
box_logits = Conv2D('box', hidden, 4 * config.NR_ANCHOR, 1)
# 1, NA(*4), im/16, im/16 (NCHW)
label_logits = tf.transpose(label_logits, [0, 2, 3, 1]) # 1xfHxfWxNA
label_logits = tf.squeeze(label_logits, 0) # fHxfWxNA
shp = tf.shape(box_logits) # 1x(NAx4)xfHxfW
box_logits = tf.transpose(box_logits, [0, 2, 3, 1]) # 1xfHxfWx(NAx4)
box_logits = tf.reshape(box_logits, tf.stack([shp[2], shp[3], config.NR_ANCHOR, 4])) # fHxfWxNAx4
return label_logits, box_logits
@under_name_scope()
def rpn_losses(anchor_labels, anchor_boxes, label_logits, box_logits):
"""
Args:
anchor_labels: fHxfWxNA
anchor_boxes: fHxfWxNAx4, encoded
label_logits: fHxfWxNA
box_logits: fHxfWxNAx4
Returns:
label_loss, box_loss
"""
with tf.device('/cpu:0'):
valid_mask = tf.stop_gradient(tf.not_equal(anchor_labels, -1))
pos_mask = tf.stop_gradient(tf.equal(anchor_labels, 1))
nr_valid = tf.stop_gradient(tf.count_nonzero(valid_mask), name='num_valid_anchor')
nr_pos = tf.count_nonzero(pos_mask, name='num_pos_anchor')
valid_anchor_labels = tf.boolean_mask(anchor_labels, valid_mask)
valid_label_logits = tf.boolean_mask(label_logits, valid_mask)
with tf.name_scope('label_metrics'):
valid_label_prob = tf.nn.sigmoid(valid_label_logits)
summaries = []
with tf.device('/cpu:0'):
for th in [0.5, 0.2, 0.1]:
valid_prediction = tf.cast(valid_label_prob > th, tf.int32)
prediction_corr = tf.count_nonzero(tf.equal(valid_prediction, valid_anchor_labels))
pos_prediction_corr = tf.count_nonzero(tf.logical_and(
valid_label_prob > th,
tf.equal(valid_prediction, valid_anchor_labels)))
summaries.append(tf.truediv(
pos_prediction_corr,
nr_pos, name='recall_th{}'.format(th)))
summaries.append(tf.truediv(
prediction_corr,
nr_valid, name='accuracy_th{}'.format(th)))
label_loss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.to_float(valid_anchor_labels), logits=valid_label_logits)
label_loss = tf.reduce_mean(label_loss, name='label_loss')
pos_anchor_boxes = tf.boolean_mask(anchor_boxes, pos_mask)
pos_box_logits = tf.boolean_mask(box_logits, pos_mask)
delta = 1.0 / 9
box_loss = tf.losses.huber_loss(
pos_anchor_boxes, pos_box_logits, delta=delta,
reduction=tf.losses.Reduction.SUM) / delta
box_loss = tf.div(
box_loss,
tf.cast(nr_valid, tf.float32), name='box_loss')
for k in [label_loss, box_loss, nr_valid, nr_pos] + summaries:
add_moving_summary(k)
return label_loss, box_loss
@under_name_scope()
def decode_bbox_target(box_predictions, anchors):
"""
Args:
box_predictions: fHxfWxNAx4, logits
anchors: fHxfWxNAx4, floatbox
Returns:
box_decoded: (fHxfWxNA)x4, float32
"""
box_pred_txtytwth = tf.reshape(box_predictions, (-1, 2, 2))
box_pred_txty, box_pred_twth = tf.split(box_pred_txtytwth, 2, axis=1)
# each is (fHxfWxNA)x1x2
anchors_x1y1x2y2 = tf.reshape(anchors, (-1, 2, 2))
anchors_x1y1, anchors_x2y2 = tf.split(anchors_x1y1x2y2, 2, axis=1)
waha = tf.to_float(anchors_x2y2 - anchors_x1y1)
xaya = tf.to_float(anchors_x2y2 + anchors_x1y1) * 0.5
wbhb = tf.exp(tf.minimum(
box_pred_twth, np.log(config.MAX_SIZE * 1.0 / config.ANCHOR_STRIDE))) * waha
xbyb = box_pred_txty * waha + xaya
x1y1 = xbyb - wbhb * 0.5
x2y2 = xbyb + wbhb * 0.5
out = tf.squeeze(tf.concat([x1y1, x2y2], axis=2), axis=1, name='output')
return out
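# Parametrization sketch (the standard Faster R-CNN box encoding): given an
# anchor with width wa, height ha and center (xa, ya), and predictions
# (tx, ty, tw, th), the decoded box has center
#   xb = tx * wa + xa,   yb = ty * ha + ya
# and size
#   wb = exp(tw) * wa,   hb = exp(th) * ha,
# returned as [xb - wb/2, yb - hb/2, xb + wb/2, yb + hb/2]. tw, th are clipped
# with tf.minimum above to keep exp() from blowing up on large predictions.
# encode_bbox_target below computes the inverse mapping.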
@under_name_scope()
def encode_bbox_target(boxes, anchors):
"""
Args:
boxes: fHxfWxNAx4, float32
anchors: fHxfWxNAx4, float32
Returns:
box_encoded: fHxfWxNAx4
"""
anchors_x1y1x2y2 = tf.reshape(anchors, (-1, 2, 2))
anchors_x1y1, anchors_x2y2 = tf.split(anchors_x1y1x2y2, 2, axis=1)
waha = tf.to_float(anchors_x2y2 - anchors_x1y1)
xaya = tf.to_float(anchors_x2y2 + anchors_x1y1) * 0.5
boxes_x1y1x2y2 = tf.reshape(boxes, (-1, 2, 2))
boxes_x1y1, boxes_x2y2 = tf.split(boxes_x1y1x2y2, 2, axis=1)
wbhb = tf.to_float(boxes_x2y2 - boxes_x1y1)
xbyb = tf.to_float(boxes_x2y2 + boxes_x1y1) * 0.5
# Note that here not all boxes are valid. Some may be zero
txty = (xbyb - xaya) / waha
twth = tf.log(wbhb / waha) # may contain -inf for invalid boxes
encoded = tf.concat([txty, twth], axis=1) # (-1x2x2)
return tf.reshape(encoded, tf.shape(boxes))
@under_name_scope()
def generate_rpn_proposals(boxes, scores, img_shape):
"""
Args:
boxes: nx4 float dtype, decoded to floatbox already
scores: n float, the logits
img_shape: [h, w]
Returns:
boxes: kx4 float
scores: k logits
"""
if get_current_tower_context().is_training:
PRE_NMS_TOPK = config.TRAIN_PRE_NMS_TOPK
POST_NMS_TOPK = config.TRAIN_POST_NMS_TOPK
else:
PRE_NMS_TOPK = config.TEST_PRE_NMS_TOPK
POST_NMS_TOPK = config.TEST_POST_NMS_TOPK
@under_name_scope()
def clip_boxes(boxes, window):
boxes = tf.maximum(boxes, 0.0)
m = tf.tile(tf.reverse(window, [0]), [2]) # (4,)
boxes = tf.minimum(boxes, tf.to_float(m))
return boxes
topk = tf.minimum(PRE_NMS_TOPK, tf.size(scores))
topk_scores, topk_indices = tf.nn.top_k(scores, k=topk, sorted=False)
topk_boxes = tf.gather(boxes, topk_indices)
topk_boxes = clip_boxes(topk_boxes, img_shape)
topk_boxes_x1y1x2y2 = tf.reshape(topk_boxes, (-1, 2, 2))
topk_boxes_x1y1, topk_boxes_x2y2 = tf.split(topk_boxes_x1y1x2y2, 2, axis=1)
# nx1x2 each
wbhb = tf.squeeze(topk_boxes_x2y2 - topk_boxes_x1y1, axis=1)
valid = tf.reduce_all(wbhb > config.RPN_MIN_SIZE, axis=1) #n,
topk_valid_boxes_x1y1x2y2 = tf.boolean_mask(topk_boxes_x1y1x2y2, valid)
topk_valid_scores = tf.boolean_mask(topk_scores, valid)
topk_valid_boxes_y1x1y2x2 = tf.reshape(
tf.reverse(topk_valid_boxes_x1y1x2y2, axis=[2]),
(-1, 4), name='nms_input_boxes')
nms_indices = tf.image.non_max_suppression(
topk_valid_boxes_y1x1y2x2,
topk_valid_scores,
max_output_size=POST_NMS_TOPK,
iou_threshold=config.RPN_PROPOSAL_NMS_THRESH)
topk_valid_boxes = tf.reshape(topk_valid_boxes_x1y1x2y2, (-1, 4))
final_boxes = tf.gather(
topk_valid_boxes,
nms_indices, name='boxes')
final_scores = tf.gather(topk_valid_scores, nms_indices, name='scores')
final_probs = tf.gather(topk_valid_scores, nms_indices, name='probs')
return final_boxes, final_scores
@under_name_scope()
def sample_fast_rcnn_targets(boxes, gt_boxes, gt_labels):
"""
Args:
boxes: nx4 region proposals, floatbox
gt_boxes: mx4, floatbox
gt_labels: m, int32
Returns:
sampled_boxes: tx4 floatbox, the rois
target_boxes: tx4 encoded box, the regression target
labels: t labels
"""
@under_name_scope()
def assign_class_to_roi(iou, gt_boxes, gt_labels):
"""
Args:
iou: nxm (nr_proposal x nr_gt)
Returns:
fg_mask: n boolean, whether each roibox is fg
roi_labels: n int32, best label for each roi box
best_gt_boxes: nx4
"""
# find best gt box for each roi box
        best_iou_ind = tf.argmax(iou, axis=1)  # n, each in [0, m)
        best_iou = tf.reduce_max(iou, axis=1)  # n,
        best_gt_boxes = tf.gather(gt_boxes, best_iou_ind)  # nx4
        best_gt_labels = tf.gather(gt_labels, best_iou_ind)  # n, each in [1, C)
fg_mask = best_iou >= config.FASTRCNN_FG_THRESH
return fg_mask, best_gt_labels, best_gt_boxes
iou = pairwise_iou(boxes, gt_boxes) # nxm
with tf.name_scope('proposal_metrics'):
# find best roi for each gt, for summary only
best_iou = tf.reduce_max(iou, axis=0)
mean_best_iou = tf.reduce_mean(best_iou, name='best_iou_per_gt')
summaries = [mean_best_iou]
with tf.device('/cpu:0'):
for th in [0.3, 0.5]:
recall = tf.truediv(
tf.count_nonzero(best_iou >= th),
tf.size(best_iou, out_type=tf.int64),
name='recall_iou{}'.format(th))
summaries.append(recall)
add_moving_summary(*summaries)
# n, n, nx4
fg_mask, roi_labels, best_gt_boxes = assign_class_to_roi(iou, gt_boxes, gt_labels)
# don't have to add gt for training, but add it anyway
fg_inds = tf.reshape(tf.where(fg_mask), [-1])
fg_inds = tf.concat([fg_inds,
tf.cast(
tf.range(tf.size(gt_labels)) + tf.shape(boxes)[0],
tf.int64)], 0)
num_fg = tf.size(fg_inds)
num_fg = tf.minimum(int(
config.FASTRCNN_BATCH_PER_IM * config.FASTRCNN_FG_RATIO[1]),
num_fg, name='num_fg')
fg_inds = tf.slice(tf.random_shuffle(fg_inds), [0], [num_fg])
bg_inds = tf.where(tf.logical_not(fg_mask))[:,0]
num_bg = tf.size(bg_inds)
num_bg = tf.minimum(config.FASTRCNN_BATCH_PER_IM - num_fg, num_bg)
num_bg = tf.minimum(
num_bg,
num_fg * int(1.0 / config.FASTRCNN_FG_RATIO[0]), name='num_bg') # don't include too many bg
bg_inds = tf.slice(tf.random_shuffle(bg_inds), [0], [num_bg])
add_moving_summary(num_fg, num_bg)
all_boxes = tf.concat([boxes, gt_boxes], axis=0)
all_matched_gt_boxes = tf.concat([best_gt_boxes, gt_boxes], axis=0)
all_labels = tf.concat([roi_labels, gt_labels], axis=0)
ind_in_all = tf.concat([fg_inds, bg_inds], axis=0) # ind in all n+m boxes
ret_boxes = tf.gather(all_boxes, ind_in_all, name='sampled_boxes')
ret_matched_gt_boxes = tf.gather(all_matched_gt_boxes, ind_in_all)
ret_encoded_boxes = encode_bbox_target(ret_matched_gt_boxes, ret_boxes)
ret_encoded_boxes = ret_encoded_boxes * tf.constant(config.FASTRCNN_BBOX_REG_WEIGHTS)
# bg boxes will not be trained on
ret_labels = tf.concat(
[tf.gather(all_labels, fg_inds),
tf.zeros_like(bg_inds, dtype=tf.int64)], axis=0, name='sampled_labels')
return ret_boxes, tf.stop_gradient(ret_encoded_boxes), tf.stop_gradient(ret_labels)
@under_name_scope()
def roi_align(featuremap, boxes, output_shape):
"""
Args:
featuremap: 1xCxHxW
boxes: Nx4 floatbox
output_shape: int
Returns:
NxCxoHxoW
"""
@under_name_scope()
def transform_fpcoor_for_tf(boxes, image_shape, crop_shape):
"""
The way crop_and_resize works (with normalized box):
Initial point (the value of output[0]): x0_box * (W_img - 1)
Spacing: w_box * (W_img - 1) / (W_crop - 1)
Use the above grid to bilinear sample.
However, what I want is (with fpcoor box):
Spacing: w_box / W_crop
Initial point: x0_box + spacing/2 - 0.5
(-0.5 because bilinear sample assumes floating point coordinate (0.0, 0.0) is the same as pixel value (0, 0))
        This function transforms fpcoor boxes into a format to be used by tf.image.crop_and_resize
Returns:
y1x1y2x2
"""
x0, y0, x1, y1 = tf.split(boxes, 4, axis=1)
spacing_w = (x1 - x0) / tf.to_float(crop_shape[1])
spacing_h = (y1 - y0) / tf.to_float(crop_shape[0])
nx0 = (x0 + spacing_w / 2 - 0.5) / tf.to_float(image_shape[1] - 1)
ny0 = (y0 + spacing_h / 2 - 0.5) / tf.to_float(image_shape[0] - 1)
nw = spacing_w * tf.to_float(crop_shape[1] - 1) / tf.to_float(image_shape[1] - 1)
nh = spacing_h * tf.to_float(crop_shape[0] - 1) / tf.to_float(image_shape[0] - 1)
return tf.concat([ny0, nx0, ny0 + nh, nx0 + nw], axis=1)
image_shape = tf.shape(featuremap)[2:]
featuremap = tf.transpose(featuremap, [0, 2, 3, 1]) # to nhwc
# sample 4 locations per roi bin
boxes = transform_fpcoor_for_tf(boxes, image_shape, [output_shape * 2, output_shape * 2])
boxes = tf.stop_gradient(boxes) # TODO
ret = tf.image.crop_and_resize(
featuremap, boxes, tf.zeros([tf.shape(boxes)[0]], dtype=tf.int32),
crop_size=[output_shape * 2, output_shape * 2])
ret = tf.transpose(ret, [0, 3, 1, 2])
ret = tf.nn.avg_pool(ret, [1, 1, 2, 2], [1, 1, 2, 2], padding='SAME', data_format='NCHW')
return ret
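# Shape sketch: with output_shape=14 (as used in train.py), each box is cropped
# and bilinearly sampled to a C x 28 x 28 patch by crop_and_resize, then 2x2
# average-pooled, giving an N x C x 14 x 14 output, i.e. roughly RoIAlign with
# 4 samples per output bin.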
def fastrcnn_head(feature, num_classes):
"""
Args:
feature (NxCx1x1):
num_classes(int): num_category + 1
Returns:
cls_logits (Nxnum_class), reg_logits (Nx num_class-1 x 4)
"""
with tf.variable_scope('fastrcnn'):
classification = FullyConnected(
'class', feature, num_classes,
W_init=tf.random_normal_initializer(stddev=0.01))
box_regression = FullyConnected(
'box', feature, (num_classes - 1) * 4,
W_init=tf.random_normal_initializer(stddev=0.001))
box_regression = tf.reshape(box_regression, (-1, num_classes - 1, 4))
return classification, box_regression
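# Shape sketch under the default config (NUM_CLASS=81): feature is the N x 2048
# output of resnet_conv5 (GlobalAvgPooling of the conv5 features), and the head
# returns classification logits of shape N x 81 and box regression logits of
# shape N x 80 x 4 (no regression output for the background class).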
@under_name_scope()
def fastrcnn_predict_boxes(labels, box_logits):
"""
Args:
labels: n,
box_logits: nx(C-1)x4
Returns:
fg_ind: fg, indices into n
fg_box_logits: fgx4
"""
fg_ind = tf.reshape(tf.where(labels > 0), [-1]) # nfg,
fg_labels = tf.gather(labels, fg_ind) # nfg,
ind_2d = tf.stack([fg_ind, fg_labels - 1], axis=1) # nfgx2
# n x c-1 x 4 -> nfgx4
fg_box_logits = tf.gather_nd(box_logits, tf.stop_gradient(ind_2d))
return fg_ind, fg_box_logits
@under_name_scope()
def fastrcnn_losses(labels, boxes, label_logits, box_logits):
"""
Args:
labels: n,
boxes: nx4, encoded
label_logits: nxC
box_logits: nx(C-1)x4
"""
label_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=labels, logits=label_logits)
label_loss = tf.reduce_mean(label_loss, name='label_loss')
prediction = tf.argmax(label_logits, axis=1, name='label_prediction')
correct = tf.to_float(tf.equal(prediction, labels)) # boolean/integer gather is unavailable on GPU
accuracy = tf.reduce_mean(correct, name='accuracy')
# n x c-1 x 4 -> nfg x 4
fg_ind, fg_box_logits = fastrcnn_predict_boxes(labels, box_logits)
fg_boxes = tf.gather(boxes, fg_ind) # nfgx4
fg_label_pred = tf.argmax(tf.gather(label_logits, fg_ind), axis=1)
num_zero = tf.reduce_sum(tf.cast(tf.equal(fg_label_pred, 0), tf.int32), name='num_zero')
false_negative = tf.truediv(num_zero, tf.size(fg_ind), name='false_negative')
fg_correct = tf.gather(correct, fg_ind)
fg_accuracy = tf.reduce_mean(fg_correct, name='fg_accuracy')
box_loss = tf.losses.huber_loss(
fg_boxes, fg_box_logits, reduction=tf.losses.Reduction.SUM)
box_loss = tf.truediv(
box_loss, tf.to_float(tf.shape(labels)[0]), name='box_loss')
for k in [label_loss, box_loss, accuracy, fg_accuracy, false_negative]:
add_moving_summary(k)
return label_loss, box_loss
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: train.py
import sys, os
import argparse
import cv2
import shutil
import itertools
import tqdm
import math
import numpy as np
import json
import tensorflow as tf
from tensorpack import *
import tensorpack.tfutils.symbolic_functions as symbf
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.tfutils import optimizer, gradproc
import tensorpack.utils.viz as tpviz
from tensorpack.utils.concurrency import subproc_call
from tensorpack.utils.gpu import get_nr_gpu
from coco import COCODetection
from basemodel import (
image_preprocess, pretrained_resnet_conv4, resnet_conv5)
from model import (
rpn_head, rpn_losses,
decode_bbox_target, encode_bbox_target,
generate_rpn_proposals, sample_fast_rcnn_targets,
roi_align, fastrcnn_head, fastrcnn_losses, fastrcnn_predict_boxes)
from data import (
get_train_dataflow, get_eval_dataflow,
get_all_anchors)
from viz import (
draw_annotation, draw_proposal_recall,
draw_predictions, draw_final_outputs)
from common import clip_boxes, CustomResize, print_config
from eval import (
eval_on_dataflow, detect_one_image, print_evaluation_scores, get_tf_nms,
nms_fastrcnn_results)
import config
class Model(ModelDesc):
def _get_inputs(self):
return [
InputDesc(tf.float32, (None, None, 3), 'image'),
InputDesc(tf.int32, (None, None, config.NR_ANCHOR), 'anchor_labels'),
InputDesc(tf.float32, (None, None, config.NR_ANCHOR, 4), 'anchor_boxes'),
InputDesc(tf.float32, (None, 4), 'gt_boxes'),
InputDesc(tf.int64, (None,), 'gt_labels'),
]
def _build_graph(self, inputs):
is_training = get_current_tower_context().is_training
image, anchor_labels, anchor_boxes, gt_boxes, gt_labels = inputs
image = tf.expand_dims(image, 0)
# FSxFSxNAx4 (FS=MAX_SIZE//ANCHOR_STRIDE)
with tf.name_scope('anchors'):
all_anchors = tf.constant(get_all_anchors(), name='all_anchors', dtype=tf.float32)
fm_anchors = tf.slice(
all_anchors, [0, 0, 0, 0], tf.stack([
tf.shape(image)[1] // config.ANCHOR_STRIDE,
tf.shape(image)[2] // config.ANCHOR_STRIDE,
-1, -1]), name='fm_anchors')
anchor_boxes_encoded = encode_bbox_target(anchor_boxes, fm_anchors)
image = image_preprocess(image, bgr=True)
image = tf.transpose(image, [0, 3, 1, 2])
# resnet50
featuremap = pretrained_resnet_conv4(image, [3, 4, 6])
rpn_label_logits, rpn_box_logits = rpn_head(featuremap)
rpn_label_loss, rpn_box_loss = rpn_losses(
anchor_labels, anchor_boxes_encoded, rpn_label_logits, rpn_box_logits)
decoded_boxes = decode_bbox_target(rpn_box_logits, fm_anchors) # (fHxfWxNA)x4, floatbox
proposal_boxes, proposal_scores = generate_rpn_proposals(
decoded_boxes,
tf.reshape(rpn_label_logits, [-1]),
tf.shape(image)[2:])
if is_training:
rcnn_sampled_boxes, rcnn_encoded_boxes, rcnn_labels = sample_fast_rcnn_targets(
proposal_boxes, gt_boxes, gt_labels)
boxes_on_featuremap = rcnn_sampled_boxes * (1.0 / config.ANCHOR_STRIDE)
roi_resized = roi_align(featuremap, boxes_on_featuremap, 14)
feature_fastrcnn = resnet_conv5(roi_resized) #nxc
fastrcnn_label_logits, fastrcnn_box_logits = fastrcnn_head(feature_fastrcnn, config.NUM_CLASS)
fastrcnn_label_loss, fastrcnn_box_loss = fastrcnn_losses(
rcnn_labels, rcnn_encoded_boxes, fastrcnn_label_logits, fastrcnn_box_logits)
wd_cost = regularize_cost(
'(?:group1|group2|group3|rpn|fastrcnn)/.*W',
l2_regularizer(1e-4), name='wd_cost')
self.cost = tf.add_n([
rpn_label_loss, rpn_box_loss,
fastrcnn_label_loss, fastrcnn_box_loss,
wd_cost], 'total_cost')
for k in self.cost, wd_cost:
add_moving_summary(k)
else:
roi_resized = roi_align(featuremap, proposal_boxes * (1.0 / config.ANCHOR_STRIDE), 14)
feature_fastrcnn = resnet_conv5(roi_resized) #nxc
label_logits, fastrcnn_box_logits = fastrcnn_head(feature_fastrcnn, config.NUM_CLASS)
label_probs = tf.nn.softmax(label_logits, name='fastrcnn_all_probs') # NP,
labels = tf.argmax(label_logits, axis=1)
fg_ind, fg_box_logits = fastrcnn_predict_boxes(labels, fastrcnn_box_logits)
fg_label_probs = tf.gather(label_probs, fg_ind, name='fastrcnn_fg_probs')
fg_boxes = tf.gather(proposal_boxes, fg_ind)
fg_box_logits = fg_box_logits / tf.constant(config.FASTRCNN_BBOX_REG_WEIGHTS)
decoded_boxes = decode_bbox_target(fg_box_logits, fg_boxes) # Nfx4, floatbox
decoded_boxes = tf.identity(decoded_boxes, name='fastrcnn_fg_boxes')
def _get_optimizer(self):
lr = symbf.get_scalar_var('learning_rate', 0.003, summary=True)
opt = tf.train.MomentumOptimizer(lr, 0.9)
return optimizer.apply_grad_processors(
opt, [gradproc.ScaleGradient(('.*/b', 2))])
def visualize(model_path, nr_visualize=50, output_dir='output'):
pred = OfflinePredictor(PredictConfig(
model=Model(),
session_init=get_model_loader(model_path),
input_names=['image', 'gt_boxes', 'gt_labels'],
output_names=[
'generate_rpn_proposals/boxes',
'generate_rpn_proposals/probs',
'fastrcnn_all_probs',
'fastrcnn_fg_probs',
'fastrcnn_fg_boxes',
]))
df = get_train_dataflow()
df.reset_state()
if os.path.isdir(output_dir):
shutil.rmtree(output_dir)
utils.fs.mkdir_p(output_dir)
with tqdm.tqdm(total=nr_visualize) as pbar:
for idx, dp in itertools.islice(enumerate(df.get_data()), nr_visualize):
img, _, _, gt_boxes, gt_labels = dp
rpn_boxes, rpn_scores, all_probs, fg_probs, fg_boxes = pred(img, gt_boxes, gt_labels)
gt_viz = draw_annotation(img, gt_boxes, gt_labels)
proposal_viz, good_proposals_ind = draw_proposal_recall(img, rpn_boxes, rpn_scores, gt_boxes)
score_viz = draw_predictions(img, rpn_boxes[good_proposals_ind], all_probs[good_proposals_ind])
fg_boxes = clip_boxes(fg_boxes, img.shape[:2])
fg_viz = draw_predictions(img, fg_boxes, fg_probs)
results = nms_fastrcnn_results(fg_boxes, fg_probs)
final_viz = draw_final_outputs(img, results)
viz = tpviz.stack_patches([
gt_viz, proposal_viz, score_viz,
fg_viz, final_viz], 2, 3)
if os.environ.get('DISPLAY', None):
tpviz.interactive_imshow(viz)
cv2.imwrite("{}/{:03d}.png".format(output_dir, idx), viz)
pbar.update()
def offline_evaluate(model_path, output_file):
pred = OfflinePredictor(PredictConfig(
model=Model(),
session_init=get_model_loader(model_path),
input_names=['image'],
output_names=[
'fastrcnn_fg_probs',
'fastrcnn_fg_boxes',
]))
df = get_eval_dataflow()
df = PrefetchDataZMQ(df, 1)
all_results = eval_on_dataflow(df, lambda img: detect_one_image(img, pred))
with open(output_file, 'w') as f:
json.dump(all_results, f)
print_evaluation_scores(output_file)
def predict(model_path, input_file):
pred = OfflinePredictor(PredictConfig(
model=Model(),
session_init=get_model_loader(model_path),
input_names=['image'],
output_names=[
'fastrcnn_fg_probs',
'fastrcnn_fg_boxes',
]))
img = cv2.imread(input_file, cv2.IMREAD_COLOR)
results = detect_one_image(img, pred)
final = draw_final_outputs(img, results)
viz = np.concatenate((img, final), axis=1)
tpviz.interactive_imshow(viz)
class EvalCallback(Callback):
def _setup_graph(self):
self.pred = self.trainer.get_predictor(['image'], ['fastrcnn_fg_probs', 'fastrcnn_fg_boxes'])
self.df = PrefetchDataZMQ(get_eval_dataflow(), 1)
EVAL_TIMES = 5 # eval 5 times during training
interval = self.trainer.config.max_epoch // (EVAL_TIMES + 1)
self.epochs_to_eval = set([interval * k for k in range(1, EVAL_TIMES)])
self.epochs_to_eval.add(self.trainer.config.max_epoch)
get_tf_nms() # just to make sure the nms part of graph is created
def _eval(self):
all_results = eval_on_dataflow(self.df, lambda img: detect_one_image(img, self.pred))
output_file = os.path.join(
logger.LOG_DIR, 'outputs{}.json'.format(self.global_step))
with open(output_file, 'w') as f:
json.dump(all_results, f)
print_evaluation_scores(output_file)
def _trigger_epoch(self):
if self.epoch_num in self.epochs_to_eval:
self._eval()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
parser.add_argument('--load', help='load model')
parser.add_argument('--logdir', help='logdir', default='train_log/fastrcnn')
parser.add_argument('--datadir', help='override config.BASEDIR')
parser.add_argument('--visualize', action='store_true')
parser.add_argument('--evaluate', help='path to the output json eval file')
parser.add_argument('--predict', help='path to the input image file')
args = parser.parse_args()
if args.datadir:
config.BASEDIR = args.datadir
print_config()
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
nr_gpu = get_nr_gpu()
if args.visualize:
assert args.load
visualize(args.load)
sys.exit()
if args.evaluate is not None:
assert args.evaluate.endswith('.json')
assert args.load
offline_evaluate(args.load, args.evaluate)
sys.exit()
if args.predict is not None:
COCODetection(config.BASEDIR, 'train2014') # to load the class names
assert args.load
predict(args.load, args.predict)
sys.exit()
logger.set_logger_dir(args.logdir, 'd')
stepnum = 300
warmup_epoch = max(math.ceil(500.0 / stepnum), 5)
cfg = TrainConfig(
model=Model(),
dataflow=get_train_dataflow(),
callbacks=[
PeriodicTrigger(ModelSaver(), every_k_epochs=5),
# linear warmup
ScheduledHyperParamSetter('learning_rate',
[(0, 0.003), (warmup_epoch, 0.01)], interp='linear'),
# step decay
ScheduledHyperParamSetter('learning_rate',
[(warmup_epoch, 0.01), ((120000//stepnum) + warmup_epoch, 1e-3), (180000//stepnum, 1e-4)]),
HumanHyperParamSetter('learning_rate'),
EvalCallback(),
GPUUtilizationTracker(),
],
steps_per_epoch=stepnum,
max_epoch=205000//stepnum,
session_init=get_model_loader(args.load),
nr_tower=nr_gpu
)
SyncMultiGPUTrainerReplicated(cfg, gpu_prefetch=False).train()
# Some third-party helper functions
+ generate_anchors.py: copied from [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py).
+ box_ops.py: modified from [TF object detection API](https://github.com/tensorflow/models/blob/master/object_detection/core/box_list_ops.py).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: box_ops.py
import tensorflow as tf
from tensorpack.tfutils.scope_utils import under_name_scope
from tensorpack.tfutils import get_default_sess_config
from tensorpack.utils.argtools import memoized
"""
This file is modified from
https://github.com/tensorflow/models/blob/master/object_detection/core/box_list_ops.py
"""
@under_name_scope()
def area(boxes):
"""
Args:
boxes: nx4 floatbox
Returns:
n
"""
x_min, y_min, x_max, y_max = tf.split(boxes, 4, axis=1)
return tf.squeeze((y_max - y_min) * (x_max - x_min), [1])
@under_name_scope()
def pairwise_intersection(boxlist1, boxlist2):
"""Compute pairwise intersection areas between boxes.
Args:
boxlist1: Nx4 floatbox
boxlist2: Mx4
Returns:
a tensor with shape [N, M] representing pairwise intersections
"""
x_min1, y_min1, x_max1, y_max1 = tf.split(boxlist1, 4, axis=1)
x_min2, y_min2, x_max2, y_max2 = tf.split(boxlist2, 4, axis=1)
all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2))
all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2))
intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin)
all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2))
all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2))
intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin)
return intersect_heights * intersect_widths
@under_name_scope()
def pairwise_iou(boxlist1, boxlist2):
"""Computes pairwise intersection-over-union between box collections.
Args:
boxlist1: Nx4 floatbox
boxlist2: Mx4
Returns:
a tensor with shape [N, M] representing pairwise iou scores.
"""
intersections = pairwise_intersection(boxlist1, boxlist2)
areas1 = area(boxlist1)
areas2 = area(boxlist2)
unions = (
tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
return tf.where(
tf.equal(intersections, 0.0),
tf.zeros_like(intersections), tf.truediv(intersections, unions))
@memoized
def get_iou_callable():
"""
Get a pairwise box iou callable.
"""
with tf.device('/cpu:0'):
A = tf.placeholder(tf.float32, shape=[None, 4])
B = tf.placeholder(tf.float32, shape=[None, 4])
iou = pairwise_iou(A, B)
sess = tf.Session(config=get_default_sess_config())
return sess.make_callable(iou, [A, B])
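# Usage sketch: the returned callable evaluates pairwise IoU on numpy inputs,
# e.g. `iou = get_iou_callable()(boxes_a, boxes_b)` where boxes_a / boxes_b are
# float32 arrays of shape (N, 4) / (M, 4) and iou is an (N, M) numpy array.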
# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------
from six.moves import range
import numpy as np
# Verify that we compute the same anchors as Shaoqing's matlab implementation:
#
# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
# >> anchors
#
# anchors =
#
# -83 -39 100 56
# -175 -87 192 104
# -359 -183 376 200
# -55 -55 72 72
# -119 -119 136 136
# -247 -247 264 264
# -35 -79 52 96
# -79 -167 96 184
# -167 -343 184 360
#array([[ -83., -39., 100., 56.],
# [-175., -87., 192., 104.],
# [-359., -183., 376., 200.],
# [ -55., -55., 72., 72.],
# [-119., -119., 136., 136.],
# [-247., -247., 264., 264.],
# [ -35., -79., 52., 96.],
# [ -79., -167., 96., 184.],
# [-167., -343., 184., 360.]])
def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
scales=2**np.arange(3, 6)):
"""
Generate anchor (reference) windows by enumerating aspect ratios X
scales wrt a reference (0, 0, 15, 15) window.
"""
base_anchor = np.array([1, 1, base_size, base_size], dtype='float32') - 1
ratio_anchors = _ratio_enum(base_anchor, ratios)
anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
for i in range(ratio_anchors.shape[0])])
return anchors
def _whctrs(anchor):
"""
Return width, height, x center, and y center for an anchor (window).
"""
w = anchor[2] - anchor[0] + 1
h = anchor[3] - anchor[1] + 1
x_ctr = anchor[0] + 0.5 * (w - 1)
y_ctr = anchor[1] + 0.5 * (h - 1)
return w, h, x_ctr, y_ctr
def _mkanchors(ws, hs, x_ctr, y_ctr):
"""
Given a vector of widths (ws) and heights (hs) around a center
(x_ctr, y_ctr), output a set of anchors (windows).
"""
ws = ws[:, np.newaxis]
hs = hs[:, np.newaxis]
anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
y_ctr - 0.5 * (hs - 1),
x_ctr + 0.5 * (ws - 1),
y_ctr + 0.5 * (hs - 1)))
return anchors
def _ratio_enum(anchor, ratios):
"""
Enumerate a set of anchors for each aspect ratio wrt an anchor.
"""
w, h, x_ctr, y_ctr = _whctrs(anchor)
size = w * h
size_ratios = size / ratios
ws = np.round(np.sqrt(size_ratios))
hs = np.round(ws * ratios)
anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
return anchors
def _scale_enum(anchor, scales):
"""
Enumerate a set of anchors for each scale wrt an anchor.
"""
w, h, x_ctr, y_ctr = _whctrs(anchor)
ws = w * scales
hs = h * scales
anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
return anchors
if __name__ == '__main__':
#import time
#t = time.time()
#a = generate_anchors()
#print(time.time() - t)
#print(a)
#from IPython import embed; embed()
print(generate_anchors(
16, scales=np.asarray((2, 4, 8, 16, 32), 'float32'),
ratios=[0.5,1,2]))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: viz.py
from six.moves import zip
import numpy as np
from tensorpack.utils import viz
from coco import COCOMeta
from utils.box_ops import get_iou_callable
def draw_annotation(img, boxes, klass, is_crowd=None):
labels = []
assert len(boxes) == len(klass)
if is_crowd is not None:
assert len(boxes) == len(is_crowd)
for cls, crd in zip(klass, is_crowd):
clsname = COCOMeta.class_names[cls]
if crd == 1:
clsname += ';Crowd'
labels.append(clsname)
else:
for cls in klass:
labels.append(COCOMeta.class_names[cls])
img = viz.draw_boxes(img, boxes, labels)
return img
def draw_proposal_recall(img, proposals, proposal_scores, gt_boxes):
"""
Draw top3 proposals for each gt.
Args:
proposals: NPx4
proposal_scores: NP
        gt_boxes: NGx4
"""
bbox_iou_float = get_iou_callable()
box_ious = bbox_iou_float(gt_boxes, proposals) #ng x np
box_ious_argsort = np.argsort(-box_ious, axis=1)
good_proposals_ind = box_ious_argsort[:,:3] # for each gt, find 3 best proposals
good_proposals_ind = np.unique(good_proposals_ind.ravel())
proposals = proposals[good_proposals_ind,:]
tags = list(map(str, proposal_scores[good_proposals_ind]))
img = viz.draw_boxes(img, proposals, tags)
return img, good_proposals_ind
def draw_predictions(img, boxes, scores):
"""
Args:
boxes: kx4
scores: kxC
"""
if len(boxes) == 0:
return img
labels = scores.argmax(axis=1)
scores = scores.max(axis=1)
tags = ["{},{:.2f}".format(COCOMeta.class_names[lb], score) for lb, score in zip(labels, scores)]
return viz.draw_boxes(img, boxes, tags)
def draw_final_outputs(img, results):
"""
Args:
results: [DetectionResult]
"""
all_boxes = []
all_tags = []
for class_id, boxes, scores in results:
all_boxes.extend(boxes)
all_tags.extend(
["{},{:.2f}".format(COCOMeta.class_names[class_id], sc) for sc in scores])
all_boxes = np.asarray(all_boxes)
if all_boxes.shape[0] == 0:
return img
return viz.draw_boxes(img, all_boxes, all_tags)