Commit 23446308 authored by Yuxin Wu

initial commit of FasterRCNN

parent 7c3b404a
# Faster-RCNN on COCO
This example aims to provide a minimal multi-GPU implementation (<1000 lines) of ResNet50-Faster-RCNN on COCO.
## Dependencies
+ TensorFlow nightly.
+ Install [pycocotools](https://github.com/pdollar/coco/tree/master/PythonAPI/pycocotools), OpenCV.
+ Pre-trained [ResNet50 model](https://goo.gl/6XjK9V) from tensorpack model zoo.
+ COCO data. It assumes the following directory structure:
```
DIR/
annotations/
instances_train2014.json
instances_val2014.json
instances_minival2014.json
instances_valminusminival2014.json
train2014/
COCO_train2014_*.jpg
val2014/
COCO_val2014_*.jpg
```
`minival` and `valminusminival` are optional. You can download them
[here](https://github.com/rbgirshick/py-faster-rcnn/blob/master/data/README.md).
## Usage
Change `BASEDIR` in `config.py` to `/path/to/DIR` as described above.
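For example, the relevant line in `config.py` looks like this (a sketch; substitute your own path):
```
BASEDIR = '/path/to/DIR'
```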
To train:
```
./train.py --load /path/to/ImageNet-ResNet50.npz
```
The code is written for training with __8 GPUs__; with fewer GPUs the performance won't be as good.
To predict on an image (and show output in a window):
```
./train.py --predict input.jpg
```
## Results
+ trainval35k/minival, `FASTRCNN_BATCH_PER_IM=256`: 32.9 mAP
+ trainval35k/minival, `FASTRCNN_BATCH_PER_IM=64`: 31.7 mAP. Training takes less than one day on 8 Maxwell TitanX GPUs.
The hyperparameters are not carefully tuned. You can probably get better performance by e.g. training longer.
## Files
This is a minimal implementation that contains only these files:
+ coco.py: load COCO data
+ data.py: prepare data for training
+ common.py: some common data preparation utilities
+ basemodel.py: implement ResNet
+ model.py: implement Faster R-CNN
+ viz.py: visualization utilities
+ utils/: third-party helper functions
+ train.py: main training script
+ eval.py: utilities for evaluation
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: basemodel.py
import tensorflow as tf
from tensorflow.contrib.layers import variance_scaling_initializer
import tensorpack.tfutils.symbolic_functions as symbf
from tensorpack.tfutils.summary import add_moving_summary, add_activation_summary
from tensorpack.tfutils.argscope import argscope, get_arg_scope
from tensorpack.models import (
Conv2D, MaxPooling, BatchNorm, BNReLU, GlobalAvgPooling, FullyConnected)
def image_preprocess(image, bgr=True):
with tf.name_scope('image_preprocess'):
if image.dtype.base_dtype != tf.float32:
image = tf.cast(image, tf.float32)
image = image * (1.0 / 255)
mean = [0.485, 0.456, 0.406] # rgb
std = [0.229, 0.224, 0.225]
if bgr:
mean = mean[::-1]
std = std[::-1]
image_mean = tf.constant(mean, dtype=tf.float32)
image_std = tf.constant(std, dtype=tf.float32)
image = (image - image_mean) / image_std
return image
def get_bn(zero_init=False):
if zero_init:
return lambda x, name: BatchNorm('bn', x, gamma_init=tf.zeros_initializer())
else:
return lambda x, name: BatchNorm('bn', x)
def resnet_shortcut(l, n_out, stride, nl=tf.identity):
data_format = get_arg_scope()['Conv2D']['data_format']
n_in = l.get_shape().as_list()[1 if data_format == 'NCHW' else 3]
if n_in != n_out: # change dimension when channel is not the same
if stride == 2 and 'group3' not in tf.get_variable_scope().name:
l = l[:,:,:-1,:-1]
return Conv2D('convshortcut', l, n_out, 1,
stride=stride, padding='VALID', nl=nl)
else:
return Conv2D('convshortcut', l, n_out, 1,
stride=stride, nl=nl)
else:
return l
def resnet_bottleneck(l, ch_out, stride):
l, shortcut = l, l
l = Conv2D('conv1', l, ch_out, 1, nl=BNReLU)
if stride == 2 and 'group3' not in tf.get_variable_scope().name:
l = tf.pad(l, [[0,0],[0,0],[0,1],[0,1]])
l = Conv2D('conv2', l, ch_out, 3, stride=2, nl=BNReLU, padding='VALID')
else:
l = Conv2D('conv2', l, ch_out, 3, stride=stride, nl=BNReLU)
l = Conv2D('conv3', l, ch_out * 4, 1, nl=get_bn(zero_init=True))
return l + resnet_shortcut(shortcut, ch_out * 4, stride, nl=get_bn(zero_init=False))
def resnet_group(l, name, block_func, features, count, stride):
with tf.variable_scope(name):
for i in range(0, count):
with tf.variable_scope('block{}'.format(i)):
l = block_func(l, features,
stride if i == 0 else 1)
                # end of each block needs an activation
l = tf.nn.relu(l)
return l
def pretrained_resnet_conv4(image, num_blocks):
assert len(num_blocks) == 3
with argscope([Conv2D, MaxPooling, BatchNorm], data_format='NCHW'), \
argscope(Conv2D, nl=tf.identity, use_bias=False), \
argscope(BatchNorm, use_local_stat=False):
l = tf.pad(image, [[0,0],[0,0],[2,3],[2,3]])
l = Conv2D('conv0', l, 64, 7, stride=2, nl=BNReLU, padding='VALID')
l = tf.pad(l, [[0,0],[0,0],[0,1],[0,1]])
l = MaxPooling('pool0', l, shape=3, stride=2, padding='VALID')
l = resnet_group(l, 'group0', resnet_bottleneck, 64, num_blocks[0], 1)
# TODO replace var by const to enable folding
l = tf.stop_gradient(l)
l = resnet_group(l, 'group1', resnet_bottleneck, 128, num_blocks[1], 2)
l = resnet_group(l, 'group2', resnet_bottleneck, 256, num_blocks[2], 2)
# 16x downsampling up to now
return l
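# Shape sketch (assuming the NCHW layout set above): for a 1 x 3 x H x W input,
# pretrained_resnet_conv4 returns a 1 x 1024 x H/16 x W/16 feature map, since
# conv0, pool0, group1 and group2 each downsample by 2 (16x in total) and the
# last bottleneck of group2 outputs 256 * 4 = 1024 channels.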
def resnet_conv5(image):
with argscope([Conv2D, GlobalAvgPooling, BatchNorm], data_format='NCHW'), \
argscope(Conv2D, nl=tf.identity, use_bias=False), \
argscope(BatchNorm, use_local_stat=False):
# 14x14:
l = resnet_group(image, 'group3', resnet_bottleneck, 512, 3, stride=2)
l = GlobalAvgPooling('gap', l)
return l
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: coco.py
import numpy as np
import os
import six
from termcolor import colored
from tabulate import tabulate
from tensorpack.dataflow import DataFromList
from tensorpack.utils import logger
from tensorpack.utils.rect import FloatBox
from tensorpack.utils.timer import timed_operation
from pycocotools.coco import COCO
__all__ = ['COCODetection', 'COCOMeta']
COCO_NUM_CATEGORY = 80
class _COCOMeta(object):
INSTANCE_TO_BASEDIR = {
'train2014': 'train2014',
'val2014': 'val2014',
'valminusminival2014': 'val2014',
'minival2014': 'val2014',
'test2014': 'test2014'
}
def valid(self):
return hasattr(self, 'cat_names')
def create(self, cat_ids, cat_names):
"""
cat_ids: list of ids
cat_names: list of names
"""
assert not self.valid()
assert len(cat_ids) == COCO_NUM_CATEGORY and len(cat_names) == COCO_NUM_CATEGORY
self.cat_names = cat_names
self.class_names = ['BG'] + self.cat_names
# background has class id of 0
self.category_id_to_class_id = {
v: i + 1 for i, v in enumerate(cat_ids)}
self.class_id_to_category_id = {
v: k for k, v in self.category_id_to_class_id.items()}
COCOMeta = _COCOMeta()
class COCODetection(object):
def __init__(self, basedir, name):
assert name in COCOMeta.INSTANCE_TO_BASEDIR.keys(), name
self.name = name
self._imgdir = os.path.join(basedir, COCOMeta.INSTANCE_TO_BASEDIR[name])
assert os.path.isdir(self._imgdir), self._imgdir
annotation_file = os.path.join(
basedir, 'annotations/instances_{}.json'.format(name))
assert os.path.isfile(annotation_file), annotation_file
self.coco = COCO(annotation_file)
# initialize the meta
cat_ids = self.coco.getCatIds()
cat_names = [c['name'] for c in self.coco.loadCats(cat_ids)]
if not COCOMeta.valid():
COCOMeta.create(cat_ids, cat_names)
else:
assert COCOMeta.cat_names == cat_names
logger.info("Instances loaded from {}.".format(annotation_file))
def load(self, add_gt=True):
"""
Args:
add_gt: whether to add ground truth annotations to the dicts
Returns:
a list of dict, each has keys including:
height, width, id, file_name,
and (if add_gt is True) boxes, class, is_crowd
"""
with timed_operation('Load Groundtruth Boxes for {}'.format(self.name)):
img_ids = self.coco.getImgIds()
img_ids.sort()
# list of dict, each has keys: height,width,id,file_name
imgs = self.coco.loadImgs(img_ids)
for img in imgs:
self._use_absolute_file_name(img)
if add_gt:
self._add_detection_gt(img)
return imgs
def _use_absolute_file_name(self, img):
"""
        Change relative filename to absolute file name.
"""
img['file_name'] = os.path.join(
self._imgdir, img['file_name'])
assert os.path.isfile(img['file_name']), img['file_name']
def _add_detection_gt(self, img):
"""
Add 'boxes', 'class', 'is_crowd' of this image to the dict, used by detection.
"""
ann_ids = self.coco.getAnnIds(imgIds=img['id'], iscrowd=None)
objs = self.coco.loadAnns(ann_ids)
# clean-up boxes
valid_objs = []
width = img['width']
height = img['height']
for obj in objs:
if obj.get('ignore', 0) == 1:
continue
x1, y1, w, h = obj['bbox']
# bbox is originally in float
# NOTE: assume in data that x1/y1 means upper-left corner and w/h means true w/h
# assume that (0.0, 0.0) is upper-left corner of the first pixel
box = FloatBox(float(x1), float(y1),
float(x1 + w), float(y1 + h))
box.clip_by_shape([height, width])
# Require non-zero seg area and more than 1x1 box size
if obj['area'] > 0 and box.is_box() and box.area() >= 4:
obj['bbox'] = [box.x1, box.y1, box.x2, box.y2]
valid_objs.append(obj)
# all geometrically-valid boxes are returned
boxes = np.asarray([obj['bbox'] for obj in valid_objs], dtype='float32') # (n, 4)
cls = np.asarray([
COCOMeta.category_id_to_class_id[obj['category_id']]
for obj in valid_objs], dtype='int32') # (n,)
is_crowd = np.asarray([obj['iscrowd'] for obj in valid_objs], dtype='int8')
# add the keys
img['boxes'] = boxes # nx4
img['class'] = cls # n, always >0
img['is_crowd'] = is_crowd # n,
def print_class_histogram(self, imgs):
nr_class = len(COCOMeta.class_names)
hist_bins = np.arange(nr_class + 1)
# Histogram of ground-truth objects
gt_hist = np.zeros((nr_class,), dtype=np.int)
for entry in imgs:
# filter crowd?
gt_inds = np.where(
(entry['class'] > 0) & (entry['is_crowd'] == 0))[0]
gt_classes = entry['class'][gt_inds]
gt_hist += np.histogram(gt_classes, bins=hist_bins)[0]
data = [[COCOMeta.class_names[i], v] for i, v in enumerate(gt_hist)]
data.append(['total', sum([x[1] for x in data])])
table = tabulate(data, headers=['class', '#box'], tablefmt='pipe')
logger.info("Ground-Truth Boxes:\n" + colored(table, 'cyan'))
@staticmethod
def load_many(basedir, names, add_gt=True):
"""
        Load and merge several instance files together.
"""
if not isinstance(names, (list, tuple)):
names = [names]
ret = []
for n in names:
coco = COCODetection(basedir, n)
ret.extend(coco.load(add_gt))
return ret
if __name__ == '__main__':
    import config
    c = COCODetection(config.BASEDIR, 'train2014')
    gt_boxes = c.load()
    print("#Images:", len(gt_boxes))
    c.print_class_histogram(gt_boxes)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: common.py
import numpy as np
import cv2
from tensorpack.dataflow import RNGDataFlow
from tensorpack.dataflow.imgaug import transform
from tensorpack.utils import logger
import config
class DataFromListOfDict(RNGDataFlow):
def __init__(self, lst, keys, shuffle=False):
self._lst = lst
self._keys = keys
self._shuffle = shuffle
self._size = len(lst)
def size(self):
return self._size
def get_data(self):
if self._shuffle:
self.rng.shuffle(self._lst)
for dic in self._lst:
dp = [dic[k] for k in self._keys]
yield dp
class CustomResize(transform.TransformAugmentorBase):
"""
    Try resizing the shortest edge to a certain number
    while making sure the longest edge does not exceed max_size.
"""
def __init__(self, size, max_size, interp=cv2.INTER_LINEAR):
"""
Args:
size (int): the size to resize the shortest edge to.
max_size (int): maximum allowed longest edge.
"""
self._init(locals())
def _get_augment_params(self, img):
h, w = img.shape[:2]
scale = self.size * 1.0 / min(h, w)
if h < w:
newh, neww = self.size, scale * w
else:
newh, neww = scale * h, self.size
if max(newh, neww) > self.max_size:
scale = self.max_size * 1.0 / max(newh, neww)
newh = newh * scale
neww = neww * scale
neww = int(neww + 0.5)
newh = int(newh + 0.5)
return transform.ResizeTransform(h, w, newh, neww, self.interp)
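# Worked example under the default config (SHORT_EDGE_SIZE=600, MAX_SIZE=1024):
# a 480x640 image is scaled by 600/480 = 1.25 to 600x800, which already fits
# under max_size. A 480x1280 image would first become 600x1600, exceed max_size,
# and be rescaled by 1024/1600 = 0.64 to about 384x1024.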
def box_to_point8(boxes):
"""
Args:
boxes: nx4
Returns:
(nx4)x2
"""
b = boxes[:,[0,1,2,3,0,3,2,1]]
b = b.reshape((-1, 2))
return b
def point8_to_box(points):
"""
Args:
points: (nx4)x2
Returns:
nx4 boxes (x1y1x2y2)
"""
p = points.reshape((-1, 4, 2))
minxy = p.min(axis=1) #nx2
maxxy = p.max(axis=1) #nx2
return np.concatenate((minxy, maxxy), axis=1)
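# box_to_point8 and point8_to_box are (approximately) inverse to each other:
# boxes are expanded to their 4 corner points so that point-based augmentors
# (resize, flip) can transform them, and the axis-aligned bounding box is taken
# back afterwards. A small sketch:
#   b = np.array([[10., 20., 30., 60.]], dtype='float32')
#   point8_to_box(box_to_point8(b))    # -> [[10., 20., 30., 60.]]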
def clip_boxes(boxes, shape):
"""
Args:
boxes: nx4, float
shape: h, w
"""
h, w = shape
boxes[:,[0,1]] = np.maximum(boxes[:,[0,1]], 0)
boxes[:,2] = np.minimum(boxes[:,2], w)
boxes[:,3] = np.minimum(boxes[:,3], h)
return boxes
def print_config():
logger.info("Config: ------------------------------------------")
for k in dir(config):
if k == k.upper():
logger.info("{} = {}".format(k, getattr(config, k)))
logger.info("--------------------------------------------------")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: config.py
import numpy as np
# dataset -----------------------
BASEDIR = '/path/to/your/COCO/DIR'
TRAIN_DATASET = ['train2014', 'valminusminival2014']
VAL_DATASET = 'minival2014' # only support evaluation on one dataset
NUM_CLASS = 81
# preprocessing --------------------
SHORT_EDGE_SIZE = 600
MAX_SIZE = 1024
# anchors -------------------------
ANCHOR_STRIDE = 16
# sqrtarea of the anchor box
ANCHOR_SIZES = (32, 64, 128, 256, 512)
ANCHOR_RATIOS = (0.5, 1., 2.)
NR_ANCHOR = len(ANCHOR_SIZES) * len(ANCHOR_RATIOS)
POSITIVE_ANCHOR_THRES = 0.7
NEGATIVE_ANCHOR_THRES = 0.3
# rpn training -------------------------
# keep fg ratio in a batch in this range
RPN_FG_RATIO = 0.5
RPN_BATCH_PER_IM = 256
RPN_MIN_SIZE = 0
RPN_PROPOSAL_NMS_THRESH = 0.7
TRAIN_PRE_NMS_TOPK = 12000
TRAIN_POST_NMS_TOPK = 2000
# boxes overlapping crowd will be ignored.
CROWD_OVERLAP_THRES = 0.7
# fastrcnn training ---------------------
FASTRCNN_BATCH_PER_IM = 64
FASTRCNN_BBOX_REG_WEIGHTS = np.array([10, 10, 5, 5], dtype='float32')
FASTRCNN_FG_THRESH = 0.5
# keep fg ratio in a batch in this range
FASTRCNN_FG_RATIO = (0.1, 0.25)
# testing -----------------------
TEST_PRE_NMS_TOPK = 6000
TEST_POST_NMS_TOPK = 1000
FASTRCNN_NMS_THRESH = 0.5
RESULT_SCORE_THRESH = 0.05
RESULTS_PER_IM = 100
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: data.py
import cv2
import os
import numpy as np
import logging
from tensorpack.utils import logger
from tensorpack.utils.argtools import memoized, log_once
from tensorpack.dataflow import (
ProxyDataFlow, MapData, imgaug, TestDataSpeed,
AugmentImageComponents, MapDataComponent)
import tensorpack.utils.viz as tpviz
from tensorpack.utils.viz import interactive_imshow
from coco import COCODetection
from utils.generate_anchors import generate_anchors
from utils.box_ops import get_iou_callable
from common import (
DataFromListOfDict, CustomResize,
box_to_point8, point8_to_box)
import config
class MalformedData(BaseException):
pass
@memoized
def get_all_anchors():
"""
Get all anchors in the largest possible image, shifted, floatbox
Returns:
anchors: SxSxNR_ANCHORx4, where S == MAX_SIZE//STRIDE, floatbox
"""
# Generates a NAx4 matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors
# are centered on stride / 2, have (approximate) sqrt areas of the specified
# sizes, and aspect ratios as given.
cell_anchors = generate_anchors(
config.ANCHOR_STRIDE,
scales=np.array(config.ANCHOR_SIZES, dtype=np.float) / config.ANCHOR_STRIDE,
ratios=np.array(config.ANCHOR_RATIOS, dtype=np.float))
# anchors are intbox here.
# anchors at featuremap [0,0] are centered at fpcoor (8,8) (half of stride)
field_size = config.MAX_SIZE // config.ANCHOR_STRIDE
shifts = np.arange(0, field_size) * config.ANCHOR_STRIDE
shift_x, shift_y = np.meshgrid(shifts, shifts)
shift_x = shift_x.flatten()
shift_y = shift_y.flatten()
shifts = np.vstack((shift_x, shift_y, shift_x, shift_y)).transpose()
# Kx4, K = field_size * field_size
K = shifts.shape[0]
A = cell_anchors.shape[0]
field_of_anchors = (
cell_anchors.reshape((1, A, 4)) +
shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
field_of_anchors = field_of_anchors.reshape((field_size, field_size, A, 4))
# FSxFSxAx4
assert np.all(field_of_anchors == field_of_anchors.astype('int32'))
field_of_anchors = field_of_anchors.astype('float32')
field_of_anchors[:,:,:,[2,3]] += 1
return field_of_anchors
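# Shape sketch under the default config (MAX_SIZE=1024, ANCHOR_STRIDE=16,
# 5 sizes x 3 ratios = 15 anchors per position): field_size = 1024 // 16 = 64,
# so get_all_anchors() returns a float32 array of shape 64 x 64 x 15 x 4, and
# the anchors at featuremap position (0, 0) are centered at image pixel (8, 8).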
def get_anchor_labels(anchors, gt_boxes, crowd_boxes):
"""
Label each anchor as fg/bg/ignore.
Args:
anchors: Ax4 float
gt_boxes: Bx4 float
crowd_boxes: Cx4 float
Returns:
anchor_labels: (A,) int. Each element is {-1, 0, 1}
anchor_boxes: Ax4. Contains the target gt_box for each anchor when the anchor is fg.
"""
# This function will modify labels and return the filtered inds
def filter_box_label(labels, value, max_num):
curr_inds = np.where(labels == value)[0]
if len(curr_inds) > max_num:
disable_inds = np.random.choice(
curr_inds, size=(len(curr_inds) - max_num),
replace=False)
labels[disable_inds] = -1 # ignore them
curr_inds = np.where(labels == value)[0]
return curr_inds
bbox_iou_float = get_iou_callable()
NA, NB = len(anchors), len(gt_boxes)
assert NB > 0 # empty images should have been filtered already
box_ious = bbox_iou_float(anchors, gt_boxes) # NA x NB
ious_argmax_per_anchor = box_ious.argmax(axis=1) # NA,
ious_max_per_anchor = box_ious.max(axis=1)
ious_max_per_gt = np.amax(box_ious, axis=0, keepdims=True) # 1xNB
    # for each gt, find all anchors (including ties) that have the max iou with it
anchors_with_max_iou_per_gt = np.where(box_ious == ious_max_per_gt)[0]
# Setting NA labels: 1--fg 0--bg -1--ignore
anchor_labels = -np.ones((NA,), dtype='int32') # NA,
    # the order of setting neg/pos labels matters
anchor_labels[anchors_with_max_iou_per_gt] = 1
anchor_labels[ious_max_per_anchor >= config.POSITIVE_ANCHOR_THRES] = 1
anchor_labels[ious_max_per_anchor < config.NEGATIVE_ANCHOR_THRES] = 0
# First label all non-ignore candidate boxes which overlap crowd as ignore
if crowd_boxes.size > 0:
cand_inds = np.where(anchor_labels >= 0)[0]
cand_anchors = anchors[cand_inds]
ious = bbox_iou_float(cand_anchors, crowd_boxes)
overlap_with_crowd = cand_inds[ious.max(axis=1) > config.CROWD_OVERLAP_THRES]
anchor_labels[overlap_with_crowd] = -1
# Filter fg labels: ignore some fg if fg is too many
old_num_fg = np.sum(anchor_labels == 1)
target_num_fg = int(config.RPN_BATCH_PER_IM * config.RPN_FG_RATIO)
fg_inds = filter_box_label(anchor_labels, 1, target_num_fg)
# Note that fg could be fewer than the target ratio
# filter bg labels. num_bg is not allowed to be too many
old_num_bg = np.sum(anchor_labels == 0)
if old_num_bg == 0 or len(fg_inds) == 0:
# No valid bg/fg in this image, skip.
        # This can happen if, e.g., the image has a large crowd.
raise MalformedData("No valid foreground/background for RPN!")
target_num_bg = config.RPN_BATCH_PER_IM - len(fg_inds)
bg_inds = filter_box_label(anchor_labels, 0, target_num_bg)
# Set anchor boxes: the best gt_box for each fg anchor
anchor_boxes = np.zeros((NA, 4), dtype='float32')
fg_boxes = gt_boxes[ious_argmax_per_anchor[fg_inds],:]
anchor_boxes[fg_inds, :] = fg_boxes
return anchor_labels, anchor_boxes
def get_rpn_anchor_input(im, boxes, klass, is_crowd):
"""
Args:
im: an image
        boxes: nx4, floatbox, gt. shouldn't be changed
klass: n,
is_crowd: n,
Returns:
The anchor labels and target boxes for each pixel in the featuremap.
fm_labels: fHxfWxNA
fm_boxes: fHxfWxNAx4
"""
boxes = boxes.copy()
ALL_ANCHORS = get_all_anchors()
H, W = im.shape[:2]
featureH, featureW = H // config.ANCHOR_STRIDE, W // config.ANCHOR_STRIDE
def filter_box_inside(im, boxes):
h, w = im.shape[:2]
indices = np.where(
(boxes[:,0] >= 0) &
(boxes[:,1] >= 0) &
(boxes[:,2] <= w) &
(boxes[:,3] <= h))[0]
return indices
crowd_boxes = boxes[is_crowd == 1]
non_crowd_boxes = boxes[is_crowd == 0]
# fHxfWxAx4
featuremap_anchors = ALL_ANCHORS[:featureH,:featureW,:,:]
featuremap_anchors_flatten = featuremap_anchors.reshape((-1, 4))
# only use anchors inside the image
inside_ind = filter_box_inside(im, featuremap_anchors_flatten)
inside_anchors = featuremap_anchors_flatten[inside_ind,:]
anchor_labels, anchor_boxes = get_anchor_labels(inside_anchors, non_crowd_boxes, crowd_boxes)
# Fill them back to original size: fHxfWx1, fHxfWx4
featuremap_labels = -np.ones((featureH * featureW * config.NR_ANCHOR, ), dtype='int32')
featuremap_labels[inside_ind] = anchor_labels
featuremap_labels = featuremap_labels.reshape((featureH, featureW, config.NR_ANCHOR))
featuremap_boxes = np.zeros((featureH * featureW * config.NR_ANCHOR, 4), dtype='float32')
featuremap_boxes[inside_ind, :] = anchor_boxes
featuremap_boxes = featuremap_boxes.reshape((featureH, featureW, config.NR_ANCHOR, 4))
return featuremap_labels, featuremap_boxes
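# Shape sketch: for a 600x800 input image (after resizing) with ANCHOR_STRIDE=16
# and NR_ANCHOR=15, featureH, featureW = 37, 50, so fm_labels has shape
# 37 x 50 x 15 with values in {-1, 0, 1} and fm_boxes has shape 37 x 50 x 15 x 4
# (gt box targets for fg anchors, zeros elsewhere).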
def read_and_augment_images(ds):
def mapf(dp):
fname = dp[0]
im = cv2.imread(fname, cv2.IMREAD_COLOR).astype('float32')
assert im is not None, dp[0]
dp[0] = im
# assume floatbox as input
assert dp[1].dtype == np.float32
dp[1] = box_to_point8(dp[1])
dp.append(fname)
return dp
ds = MapData(ds, mapf)
augs = [CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE),
imgaug.Flip(horiz=True)]
ds = AugmentImageComponents(ds, augs, index=(0,), coords_index=(1,))
def unmapf(points):
boxes = point8_to_box(points)
return boxes
ds = MapDataComponent(ds, unmapf, 1)
return ds
def get_train_dataflow():
imgs = COCODetection.load_many(config.BASEDIR, config.TRAIN_DATASET)
    # Valid training images should have at least one fg box.
    # This filter should not be applied for testing.
    imgs = list(filter(lambda img: len(img['boxes']) > 0, imgs))  # filter out invalid training images
ds = DataFromListOfDict(
imgs,
        ['file_name', 'boxes', 'class', 'is_crowd'],  # we only need these four keys
shuffle=True)
ds = read_and_augment_images(ds)
def add_anchor_to_dp(dp):
im, boxes, klass, is_crowd, fname = dp
try:
fm_labels, fm_boxes = get_rpn_anchor_input(im, boxes, klass, is_crowd)
boxes = boxes[is_crowd == 0] # skip crowd boxes in training target
klass = klass[is_crowd == 0]
if not len(boxes):
raise MalformedData("No valid gt_boxes!")
except MalformedData as e:
log_once("Input {} is invalid for training: {}".format(fname, str(e)), 'warn')
return None
return [im, fm_labels, fm_boxes, boxes, klass]
ds = MapData(ds, add_anchor_to_dp)
return ds
def get_eval_dataflow():
imgs = COCODetection.load_many(config.BASEDIR, config.VAL_DATASET, add_gt=False)
    # unlike training, no filter is applied for evaluation
ds = DataFromListOfDict(imgs, ['file_name', 'id'])
def f(fname):
im = cv2.imread(fname, cv2.IMREAD_COLOR)
assert im is not None, fname
return im
ds = MapDataComponent(ds, f, 0)
return ds
if __name__ == '__main__':
#logger.setLevel(logging.DEBUG)
from tensorpack.dataflow import PrintData
    ds = get_train_dataflow()  # uses config.BASEDIR; set it in config.py first
ds = PrintData(ds, 100)
TestDataSpeed(ds, 50000).start()
ds.reset_state()
for k in ds.get_data():
pass
#import IPython as IP; IP.embed()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: eval.py
import numpy as np
import tqdm
import cv2
import os
from collections import namedtuple
import tensorflow as tf
from tensorpack.dataflow import MapDataComponent, TestDataSpeed
from tensorpack.tfutils import get_default_sess_config
from tensorpack.utils.argtools import memoized
from tensorpack.utils.utils import get_tqdm_kwargs
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from coco import COCODetection, COCOMeta
from common import clip_boxes, DataFromListOfDict, CustomResize
import config
DetectionResult = namedtuple(
'DetectionResult',
['class_id', 'boxes', 'scores'])
@memoized
def get_tf_nms():
"""
Get a NMS callable.
"""
boxes = tf.placeholder(tf.float32, shape=[None, 4])
scores = tf.placeholder(tf.float32, shape=[None])
indices = tf.image.non_max_suppression(
boxes, scores,
config.RESULTS_PER_IM, config.FASTRCNN_NMS_THRESH)
sess = tf.Session(config=get_default_sess_config())
return sess.make_callable(indices, [boxes, scores])
def nms_fastrcnn_results(boxes, probs):
"""
Args:
boxes: nx4 floatbox in float32
probs: nxC
Returns:
[DetectionResult]
"""
C = probs.shape[1]
boxes = boxes.copy()
boxes_per_class = {}
nms_func = get_tf_nms()
ret = []
for klass in range(1, C):
ids = np.where(probs[:, klass] > config.RESULT_SCORE_THRESH)[0]
if ids.size == 0:
continue
probs_k = probs[ids, klass].flatten()
boxes_k = boxes[ids,:]
selected_ids = nms_func(boxes_k[:,[1,0,3,2]], probs_k)
selected_boxes = boxes_k[selected_ids, :].copy()
ret.append(DetectionResult(klass, selected_boxes, probs_k[selected_ids]))
if len(ret):
newret = []
all_scores = np.hstack([x.scores for x in ret])
if len(all_scores) > config.RESULTS_PER_IM:
score_thresh = np.sort(all_scores)[-config.RESULTS_PER_IM]
for klass, boxes, scores in ret:
keep_ids = np.where(scores >= score_thresh)[0]
if len(keep_ids):
newret.append(DetectionResult(
klass, boxes[keep_ids,:], scores[keep_ids]))
ret = newret
return ret
def detect_one_image(img, model_func):
"""
Run detection on one image, using the TF callable.
This function should handle the preprocessing internally.
Args:
img: an image
model_func: a callable from TF model, takes [image] and returns (probs, boxes)
Returns:
[DetectionResult]
"""
resizer = CustomResize(config.SHORT_EDGE_SIZE, config.MAX_SIZE)
resized_img = resizer.augment(img)
scale = (resized_img.shape[0] * 1.0 / img.shape[0] + resized_img.shape[1] * 1.0 / img.shape[1]) / 2
fg_probs, fg_boxes = model_func([resized_img])
fg_boxes = fg_boxes / scale
fg_boxes = clip_boxes(fg_boxes, img.shape[:2])
return nms_fastrcnn_results(fg_boxes, fg_probs)
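# Minimal usage sketch (assuming `pred` is an OfflinePredictor built from the
# trained model, taking ['image'] and returning fastrcnn_fg_probs /
# fastrcnn_fg_boxes, as done in train.py):
#   results = detect_one_image(cv2.imread('input.jpg'), pred)
#   for r in results:  # each r is a DetectionResult(class_id, boxes, scores)
#       print(COCOMeta.class_names[r.class_id], r.boxes.shape, r.scores)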
def eval_on_dataflow(df, detect_func):
"""
Args:
df: a DataFlow which produces (image, image_id)
detect_func: a callable, takes [image] and returns a dict
Returns:
list of dict, to be dumped to COCO json format
"""
df.reset_state()
all_results = []
with tqdm.tqdm(total=df.size(), **get_tqdm_kwargs()) as pbar:
for img, img_id in df.get_data():
results = detect_func(img)
for classid, boxes, scores in results:
cat_id = COCOMeta.class_id_to_category_id[classid]
boxes[:,2] -= boxes[:,0]
boxes[:,3] -= boxes[:,1]
for box, score in zip(boxes, scores):
all_results.append({
'image_id': img_id,
'category_id': cat_id,
'bbox': list(map(lambda x: float(round(x, 1)), box)),
'score': float(round(score, 2)),
})
pbar.update(1)
return all_results
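# Note: COCO's result json expects boxes as [x, y, width, height], which is why
# x2, y2 are converted to width, height above before being dumped.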
# https://github.com/pdollar/coco/blob/master/PythonAPI/pycocoEvalDemo.ipynb
def print_evaluation_scores(json_file):
assert config.BASEDIR and os.path.isdir(config.BASEDIR)
annofile = os.path.join(
config.BASEDIR, 'annotations',
'instances_{}.json'.format(config.VAL_DATASET))
coco = COCO(annofile)
cocoDt = coco.loadRes(json_file)
imgIds = sorted(coco.getImgIds())
cocoEval = COCOeval(coco, cocoDt, 'bbox')
cocoEval.params.imgIds = imgIds
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
if __name__ == '__main__':
    from data import get_eval_dataflow
    ds = get_eval_dataflow()  # uses config.BASEDIR / config.VAL_DATASET
    print("Size: ", ds.size())
    TestDataSpeed(ds, 1000).start()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: model.py
import numpy as np
import tensorflow as tf
from tensorpack.tfutils import get_current_tower_context
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.tfutils.argscope import argscope
from tensorpack.tfutils.scope_utils import under_name_scope
from tensorpack.models import Conv2D, FullyConnected
from utils.box_ops import pairwise_iou
import config
def rpn_head(featuremap):
with tf.variable_scope('rpn'), \
argscope(Conv2D, data_format='NCHW',
W_init=tf.random_normal_initializer(stddev=0.01)):
hidden = Conv2D('conv0', featuremap, 1024, 3, nl=tf.nn.relu)
label_logits = Conv2D('class', hidden, config.NR_ANCHOR, 1)
box_logits = Conv2D('box', hidden, 4 * config.NR_ANCHOR, 1)
# 1, NA(*4), im/16, im/16 (NCHW)
label_logits = tf.transpose(label_logits, [0, 2, 3, 1]) # 1xfHxfWxNA
label_logits = tf.squeeze(label_logits, 0) # fHxfWxNA
shp = tf.shape(box_logits) # 1x(NAx4)xfHxfW
box_logits = tf.transpose(box_logits, [0, 2, 3, 1]) # 1xfHxfWx(NAx4)
box_logits = tf.reshape(box_logits, tf.stack([shp[2], shp[3], config.NR_ANCHOR, 4])) # fHxfWxNAx4
return label_logits, box_logits
@under_name_scope()
def rpn_losses(anchor_labels, anchor_boxes, label_logits, box_logits):
"""
Args:
anchor_labels: fHxfWxNA
anchor_boxes: fHxfWxNAx4, encoded
label_logits: fHxfWxNA
box_logits: fHxfWxNAx4
Returns:
label_loss, box_loss
"""
with tf.device('/cpu:0'):
valid_mask = tf.stop_gradient(tf.not_equal(anchor_labels, -1))
pos_mask = tf.stop_gradient(tf.equal(anchor_labels, 1))
nr_valid = tf.stop_gradient(tf.count_nonzero(valid_mask), name='num_valid_anchor')
nr_pos = tf.count_nonzero(pos_mask, name='num_pos_anchor')
valid_anchor_labels = tf.boolean_mask(anchor_labels, valid_mask)
valid_label_logits = tf.boolean_mask(label_logits, valid_mask)
with tf.name_scope('label_metrics'):
valid_label_prob = tf.nn.sigmoid(valid_label_logits)
summaries = []
with tf.device('/cpu:0'):
for th in [0.5, 0.2, 0.1]:
valid_prediction = tf.cast(valid_label_prob > th, tf.int32)
prediction_corr = tf.count_nonzero(tf.equal(valid_prediction, valid_anchor_labels))
pos_prediction_corr = tf.count_nonzero(tf.logical_and(
valid_label_prob > th,
tf.equal(valid_prediction, valid_anchor_labels)))
summaries.append(tf.truediv(
pos_prediction_corr,
nr_pos, name='recall_th{}'.format(th)))
summaries.append(tf.truediv(
prediction_corr,
nr_valid, name='accuracy_th{}'.format(th)))
label_loss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.to_float(valid_anchor_labels), logits=valid_label_logits)
label_loss = tf.reduce_mean(label_loss, name='label_loss')
pos_anchor_boxes = tf.boolean_mask(anchor_boxes, pos_mask)
pos_box_logits = tf.boolean_mask(box_logits, pos_mask)
delta = 1.0 / 9
box_loss = tf.losses.huber_loss(
pos_anchor_boxes, pos_box_logits, delta=delta,
reduction=tf.losses.Reduction.SUM) / delta
box_loss = tf.div(
box_loss,
tf.cast(nr_valid, tf.float32), name='box_loss')
for k in [label_loss, box_loss, nr_valid, nr_pos] + summaries:
add_moving_summary(k)
return label_loss, box_loss
@under_name_scope()
def decode_bbox_target(box_predictions, anchors):
"""
Args:
box_predictions: fHxfWxNAx4, logits
anchors: fHxfWxNAx4, floatbox
Returns:
box_decoded: (fHxfWxNA)x4, float32
"""
box_pred_txtytwth = tf.reshape(box_predictions, (-1, 2, 2))
box_pred_txty, box_pred_twth = tf.split(box_pred_txtytwth, 2, axis=1)
# each is (fHxfWxNA)x1x2
anchors_x1y1x2y2 = tf.reshape(anchors, (-1, 2, 2))
anchors_x1y1, anchors_x2y2 = tf.split(anchors_x1y1x2y2, 2, axis=1)
waha = tf.to_float(anchors_x2y2 - anchors_x1y1)
xaya = tf.to_float(anchors_x2y2 + anchors_x1y1) * 0.5
wbhb = tf.exp(tf.minimum(
box_pred_twth, np.log(config.MAX_SIZE * 1.0 / config.ANCHOR_STRIDE))) * waha
xbyb = box_pred_txty * waha + xaya
x1y1 = xbyb - wbhb * 0.5
x2y2 = xbyb + wbhb * 0.5
out = tf.squeeze(tf.concat([x1y1, x2y2], axis=2), axis=1, name='output')
return out
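# Parametrization sketch (the standard Faster R-CNN box encoding): given an
# anchor with width wa, height ha and center (xa, ya), and predictions
# (tx, ty, tw, th), the decoded box has center
#   xb = tx * wa + xa,   yb = ty * ha + ya
# and size
#   wb = exp(tw) * wa,   hb = exp(th) * ha,
# returned as [xb - wb/2, yb - hb/2, xb + wb/2, yb + hb/2]. tw, th are clipped
# with tf.minimum above to keep exp() from blowing up on large predictions.
# encode_bbox_target below computes the inverse mapping.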
@under_name_scope()
def encode_bbox_target(boxes, anchors):
"""
Args:
boxes: fHxfWxNAx4, float32
anchors: fHxfWxNAx4, float32
Returns:
box_encoded: fHxfWxNAx4
"""
anchors_x1y1x2y2 = tf.reshape(anchors, (-1, 2, 2))
anchors_x1y1, anchors_x2y2 = tf.split(anchors_x1y1x2y2, 2, axis=1)
waha = tf.to_float(anchors_x2y2 - anchors_x1y1)
xaya = tf.to_float(anchors_x2y2 + anchors_x1y1) * 0.5
boxes_x1y1x2y2 = tf.reshape(boxes, (-1, 2, 2))
boxes_x1y1, boxes_x2y2 = tf.split(boxes_x1y1x2y2, 2, axis=1)
wbhb = tf.to_float(boxes_x2y2 - boxes_x1y1)
xbyb = tf.to_float(boxes_x2y2 + boxes_x1y1) * 0.5
# Note that here not all boxes are valid. Some may be zero
txty = (xbyb - xaya) / waha
twth = tf.log(wbhb / waha) # may contain -inf for invalid boxes
encoded = tf.concat([txty, twth], axis=1) # (-1x2x2)
return tf.reshape(encoded, tf.shape(boxes))
@under_name_scope()
def generate_rpn_proposals(boxes, scores, img_shape):
"""
Args:
boxes: nx4 float dtype, decoded to floatbox already
scores: n float, the logits
img_shape: [h, w]
Returns:
boxes: kx4 float
scores: k logits
"""
if get_current_tower_context().is_training:
PRE_NMS_TOPK = config.TRAIN_PRE_NMS_TOPK
POST_NMS_TOPK = config.TRAIN_POST_NMS_TOPK
else:
PRE_NMS_TOPK = config.TEST_PRE_NMS_TOPK
POST_NMS_TOPK = config.TEST_POST_NMS_TOPK
@under_name_scope()
def clip_boxes(boxes, window):
boxes = tf.maximum(boxes, 0.0)
m = tf.tile(tf.reverse(window, [0]), [2]) # (4,)
boxes = tf.minimum(boxes, tf.to_float(m))
return boxes
topk = tf.minimum(PRE_NMS_TOPK, tf.size(scores))
topk_scores, topk_indices = tf.nn.top_k(scores, k=topk, sorted=False)
topk_boxes = tf.gather(boxes, topk_indices)
topk_boxes = clip_boxes(topk_boxes, img_shape)
topk_boxes_x1y1x2y2 = tf.reshape(topk_boxes, (-1, 2, 2))
topk_boxes_x1y1, topk_boxes_x2y2 = tf.split(topk_boxes_x1y1x2y2, 2, axis=1)
# nx1x2 each
wbhb = tf.squeeze(topk_boxes_x2y2 - topk_boxes_x1y1, axis=1)
valid = tf.reduce_all(wbhb > config.RPN_MIN_SIZE, axis=1) #n,
topk_valid_boxes_x1y1x2y2 = tf.boolean_mask(topk_boxes_x1y1x2y2, valid)
topk_valid_scores = tf.boolean_mask(topk_scores, valid)
topk_valid_boxes_y1x1y2x2 = tf.reshape(
tf.reverse(topk_valid_boxes_x1y1x2y2, axis=[2]),
(-1, 4), name='nms_input_boxes')
nms_indices = tf.image.non_max_suppression(
topk_valid_boxes_y1x1y2x2,
topk_valid_scores,
max_output_size=POST_NMS_TOPK,
iou_threshold=config.RPN_PROPOSAL_NMS_THRESH)
topk_valid_boxes = tf.reshape(topk_valid_boxes_x1y1x2y2, (-1, 4))
final_boxes = tf.gather(
topk_valid_boxes,
nms_indices, name='boxes')
final_scores = tf.gather(topk_valid_scores, nms_indices, name='scores')
final_probs = tf.gather(topk_valid_scores, nms_indices, name='probs')
return final_boxes, final_scores
@under_name_scope()
def sample_fast_rcnn_targets(boxes, gt_boxes, gt_labels):
"""
Args:
boxes: nx4 region proposals, floatbox
gt_boxes: mx4, floatbox
gt_labels: m, int32
Returns:
sampled_boxes: tx4 floatbox, the rois
target_boxes: tx4 encoded box, the regression target
labels: t labels
"""
@under_name_scope()
def assign_class_to_roi(iou, gt_boxes, gt_labels):
"""
Args:
iou: nxm (nr_proposal x nr_gt)
Returns:
fg_mask: n boolean, whether each roibox is fg
roi_labels: n int32, best label for each roi box
best_gt_boxes: nx4
"""
# find best gt box for each roi box
        best_iou_ind = tf.argmax(iou, axis=1)  # n, each in [0, m)
        best_iou = tf.reduce_max(iou, axis=1)  # n,
        best_gt_boxes = tf.gather(gt_boxes, best_iou_ind)  # nx4
        best_gt_labels = tf.gather(gt_labels, best_iou_ind)  # n, each in [1, C)
fg_mask = best_iou >= config.FASTRCNN_FG_THRESH
return fg_mask, best_gt_labels, best_gt_boxes
iou = pairwise_iou(boxes, gt_boxes) # nxm
with tf.name_scope('proposal_metrics'):
# find best roi for each gt, for summary only
best_iou = tf.reduce_max(iou, axis=0)
mean_best_iou = tf.reduce_mean(best_iou, name='best_iou_per_gt')
summaries = [mean_best_iou]
with tf.device('/cpu:0'):
for th in [0.3, 0.5]:
recall = tf.truediv(
tf.count_nonzero(best_iou >= th),
tf.size(best_iou, out_type=tf.int64),
name='recall_iou{}'.format(th))
summaries.append(recall)
add_moving_summary(*summaries)
# n, n, nx4
fg_mask, roi_labels, best_gt_boxes = assign_class_to_roi(iou, gt_boxes, gt_labels)
# don't have to add gt for training, but add it anyway
fg_inds = tf.reshape(tf.where(fg_mask), [-1])
fg_inds = tf.concat([fg_inds,
tf.cast(
tf.range(tf.size(gt_labels)) + tf.shape(boxes)[0],
tf.int64)], 0)
num_fg = tf.size(fg_inds)
num_fg = tf.minimum(int(
config.FASTRCNN_BATCH_PER_IM * config.FASTRCNN_FG_RATIO[1]),
num_fg, name='num_fg')
fg_inds = tf.slice(tf.random_shuffle(fg_inds), [0], [num_fg])
bg_inds = tf.where(tf.logical_not(fg_mask))[:,0]
num_bg = tf.size(bg_inds)
num_bg = tf.minimum(config.FASTRCNN_BATCH_PER_IM - num_fg, num_bg)
num_bg = tf.minimum(
num_bg,
num_fg * int(1.0 / config.FASTRCNN_FG_RATIO[0]), name='num_bg') # don't include too many bg
bg_inds = tf.slice(tf.random_shuffle(bg_inds), [0], [num_bg])
add_moving_summary(num_fg, num_bg)
all_boxes = tf.concat([boxes, gt_boxes], axis=0)
all_matched_gt_boxes = tf.concat([best_gt_boxes, gt_boxes], axis=0)
all_labels = tf.concat([roi_labels, gt_labels], axis=0)
ind_in_all = tf.concat([fg_inds, bg_inds], axis=0) # ind in all n+m boxes
ret_boxes = tf.gather(all_boxes, ind_in_all, name='sampled_boxes')
ret_matched_gt_boxes = tf.gather(all_matched_gt_boxes, ind_in_all)
ret_encoded_boxes = encode_bbox_target(ret_matched_gt_boxes, ret_boxes)
ret_encoded_boxes = ret_encoded_boxes * tf.constant(config.FASTRCNN_BBOX_REG_WEIGHTS)
# bg boxes will not be trained on
ret_labels = tf.concat(
[tf.gather(all_labels, fg_inds),
tf.zeros_like(bg_inds, dtype=tf.int64)], axis=0, name='sampled_labels')
return ret_boxes, tf.stop_gradient(ret_encoded_boxes), tf.stop_gradient(ret_labels)
@under_name_scope()
def roi_align(featuremap, boxes, output_shape):
"""
Args:
featuremap: 1xCxHxW
boxes: Nx4 floatbox
output_shape: int
Returns:
NxCxoHxoW
"""
@under_name_scope()
def transform_fpcoor_for_tf(boxes, image_shape, crop_shape):
"""
The way crop_and_resize works (with normalized box):
Initial point (the value of output[0]): x0_box * (W_img - 1)
Spacing: w_box * (W_img - 1) / (W_crop - 1)
Use the above grid to bilinear sample.
However, what I want is (with fpcoor box):
Spacing: w_box / W_crop
Initial point: x0_box + spacing/2 - 0.5
(-0.5 because bilinear sample assumes floating point coordinate (0.0, 0.0) is the same as pixel value (0, 0))
        This function transforms fpcoor boxes into a format to be used by tf.image.crop_and_resize
Returns:
y1x1y2x2
"""
x0, y0, x1, y1 = tf.split(boxes, 4, axis=1)
spacing_w = (x1 - x0) / tf.to_float(crop_shape[1])
spacing_h = (y1 - y0) / tf.to_float(crop_shape[0])
nx0 = (x0 + spacing_w / 2 - 0.5) / tf.to_float(image_shape[1] - 1)
ny0 = (y0 + spacing_h / 2 - 0.5) / tf.to_float(image_shape[0] - 1)
nw = spacing_w * tf.to_float(crop_shape[1] - 1) / tf.to_float(image_shape[1] - 1)
nh = spacing_h * tf.to_float(crop_shape[0] - 1) / tf.to_float(image_shape[0] - 1)
return tf.concat([ny0, nx0, ny0 + nh, nx0 + nw], axis=1)
image_shape = tf.shape(featuremap)[2:]
featuremap = tf.transpose(featuremap, [0, 2, 3, 1]) # to nhwc
# sample 4 locations per roi bin
boxes = transform_fpcoor_for_tf(boxes, image_shape, [output_shape * 2, output_shape * 2])
boxes = tf.stop_gradient(boxes) # TODO
ret = tf.image.crop_and_resize(
featuremap, boxes, tf.zeros([tf.shape(boxes)[0]], dtype=tf.int32),
crop_size=[output_shape * 2, output_shape * 2])
ret = tf.transpose(ret, [0, 3, 1, 2])
ret = tf.nn.avg_pool(ret, [1, 1, 2, 2], [1, 1, 2, 2], padding='SAME', data_format='NCHW')
return ret
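# Shape sketch: with output_shape=14 (as used in train.py), each box is cropped
# and bilinearly sampled to a C x 28 x 28 patch by crop_and_resize, then 2x2
# average-pooled, giving an N x C x 14 x 14 output, i.e. roughly RoIAlign with
# 4 samples per output bin.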
def fastrcnn_head(feature, num_classes):
"""
Args:
feature (NxCx1x1):
num_classes(int): num_category + 1
Returns:
cls_logits (Nxnum_class), reg_logits (Nx num_class-1 x 4)
"""
with tf.variable_scope('fastrcnn'):
classification = FullyConnected(
'class', feature, num_classes,
W_init=tf.random_normal_initializer(stddev=0.01))
box_regression = FullyConnected(
'box', feature, (num_classes - 1) * 4,
W_init=tf.random_normal_initializer(stddev=0.001))
box_regression = tf.reshape(box_regression, (-1, num_classes - 1, 4))
return classification, box_regression
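# Shape sketch under the default config (NUM_CLASS=81): feature is the N x 2048
# output of resnet_conv5 (GlobalAvgPooling of the conv5 features), and the head
# returns classification logits of shape N x 81 and box regression logits of
# shape N x 80 x 4 (no regression output for the background class).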
@under_name_scope()
def fastrcnn_predict_boxes(labels, box_logits):
"""
Args:
labels: n,
box_logits: nx(C-1)x4
Returns:
fg_ind: fg, indices into n
fg_box_logits: fgx4
"""
fg_ind = tf.reshape(tf.where(labels > 0), [-1]) # nfg,
fg_labels = tf.gather(labels, fg_ind) # nfg,
ind_2d = tf.stack([fg_ind, fg_labels - 1], axis=1) # nfgx2
# n x c-1 x 4 -> nfgx4
fg_box_logits = tf.gather_nd(box_logits, tf.stop_gradient(ind_2d))
return fg_ind, fg_box_logits
@under_name_scope()
def fastrcnn_losses(labels, boxes, label_logits, box_logits):
"""
Args:
labels: n,
boxes: nx4, encoded
label_logits: nxC
box_logits: nx(C-1)x4
"""
label_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=labels, logits=label_logits)
label_loss = tf.reduce_mean(label_loss, name='label_loss')
prediction = tf.argmax(label_logits, axis=1, name='label_prediction')
correct = tf.to_float(tf.equal(prediction, labels)) # boolean/integer gather is unavailable on GPU
accuracy = tf.reduce_mean(correct, name='accuracy')
# n x c-1 x 4 -> nfg x 4
fg_ind, fg_box_logits = fastrcnn_predict_boxes(labels, box_logits)
fg_boxes = tf.gather(boxes, fg_ind) # nfgx4
fg_label_pred = tf.argmax(tf.gather(label_logits, fg_ind), axis=1)
num_zero = tf.reduce_sum(tf.cast(tf.equal(fg_label_pred, 0), tf.int32), name='num_zero')
false_negative = tf.truediv(num_zero, tf.size(fg_ind), name='false_negative')
fg_correct = tf.gather(correct, fg_ind)
fg_accuracy = tf.reduce_mean(fg_correct, name='fg_accuracy')
box_loss = tf.losses.huber_loss(
fg_boxes, fg_box_logits, reduction=tf.losses.Reduction.SUM)
box_loss = tf.truediv(
box_loss, tf.to_float(tf.shape(labels)[0]), name='box_loss')
for k in [label_loss, box_loss, accuracy, fg_accuracy, false_negative]:
add_moving_summary(k)
return label_loss, box_loss
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: train.py
import sys, os
import argparse
import cv2
import shutil
import itertools
import tqdm
import math
import numpy as np
import json
import tensorflow as tf
from tensorpack import *
import tensorpack.tfutils.symbolic_functions as symbf
from tensorpack.tfutils.summary import add_moving_summary
from tensorpack.tfutils import optimizer, gradproc
import tensorpack.utils.viz as tpviz
from tensorpack.utils.concurrency import subproc_call
from tensorpack.utils.gpu import get_nr_gpu
from coco import COCODetection
from basemodel import (
image_preprocess, pretrained_resnet_conv4, resnet_conv5)
from model import (
rpn_head, rpn_losses,
decode_bbox_target, encode_bbox_target,
generate_rpn_proposals, sample_fast_rcnn_targets,
roi_align, fastrcnn_head, fastrcnn_losses, fastrcnn_predict_boxes)
from data import (
get_train_dataflow, get_eval_dataflow,
get_all_anchors)
from viz import (
draw_annotation, draw_proposal_recall,
draw_predictions, draw_final_outputs)
from common import clip_boxes, CustomResize, print_config
from eval import (
eval_on_dataflow, detect_one_image, print_evaluation_scores, get_tf_nms,
nms_fastrcnn_results)
import config
class Model(ModelDesc):
def _get_inputs(self):
return [
InputDesc(tf.float32, (None, None, 3), 'image'),
InputDesc(tf.int32, (None, None, config.NR_ANCHOR), 'anchor_labels'),
InputDesc(tf.float32, (None, None, config.NR_ANCHOR, 4), 'anchor_boxes'),
InputDesc(tf.float32, (None, 4), 'gt_boxes'),
InputDesc(tf.int64, (None,), 'gt_labels'),
]
def _build_graph(self, inputs):
is_training = get_current_tower_context().is_training
image, anchor_labels, anchor_boxes, gt_boxes, gt_labels = inputs
image = tf.expand_dims(image, 0)
# FSxFSxNAx4 (FS=MAX_SIZE//ANCHOR_STRIDE)
with tf.name_scope('anchors'):
all_anchors = tf.constant(get_all_anchors(), name='all_anchors', dtype=tf.float32)
fm_anchors = tf.slice(
all_anchors, [0, 0, 0, 0], tf.stack([
tf.shape(image)[1] // config.ANCHOR_STRIDE,
tf.shape(image)[2] // config.ANCHOR_STRIDE,
-1, -1]), name='fm_anchors')
anchor_boxes_encoded = encode_bbox_target(anchor_boxes, fm_anchors)
image = image_preprocess(image, bgr=True)
image = tf.transpose(image, [0, 3, 1, 2])
# resnet50
featuremap = pretrained_resnet_conv4(image, [3, 4, 6])
rpn_label_logits, rpn_box_logits = rpn_head(featuremap)
rpn_label_loss, rpn_box_loss = rpn_losses(
anchor_labels, anchor_boxes_encoded, rpn_label_logits, rpn_box_logits)
decoded_boxes = decode_bbox_target(rpn_box_logits, fm_anchors) # (fHxfWxNA)x4, floatbox
proposal_boxes, proposal_scores = generate_rpn_proposals(
decoded_boxes,
tf.reshape(rpn_label_logits, [-1]),
tf.shape(image)[2:])
if is_training:
rcnn_sampled_boxes, rcnn_encoded_boxes, rcnn_labels = sample_fast_rcnn_targets(
proposal_boxes, gt_boxes, gt_labels)
boxes_on_featuremap = rcnn_sampled_boxes * (1.0 / config.ANCHOR_STRIDE)
roi_resized = roi_align(featuremap, boxes_on_featuremap, 14)
feature_fastrcnn = resnet_conv5(roi_resized) #nxc
fastrcnn_label_logits, fastrcnn_box_logits = fastrcnn_head(feature_fastrcnn, config.NUM_CLASS)
fastrcnn_label_loss, fastrcnn_box_loss = fastrcnn_losses(
rcnn_labels, rcnn_encoded_boxes, fastrcnn_label_logits, fastrcnn_box_logits)
wd_cost = regularize_cost(
'(?:group1|group2|group3|rpn|fastrcnn)/.*W',
l2_regularizer(1e-4), name='wd_cost')
self.cost = tf.add_n([
rpn_label_loss, rpn_box_loss,
fastrcnn_label_loss, fastrcnn_box_loss,
wd_cost], 'total_cost')
for k in self.cost, wd_cost:
add_moving_summary(k)
else:
roi_resized = roi_align(featuremap, proposal_boxes * (1.0 / config.ANCHOR_STRIDE), 14)
feature_fastrcnn = resnet_conv5(roi_resized) #nxc
label_logits, fastrcnn_box_logits = fastrcnn_head(feature_fastrcnn, config.NUM_CLASS)
label_probs = tf.nn.softmax(label_logits, name='fastrcnn_all_probs') # NP,
labels = tf.argmax(label_logits, axis=1)
fg_ind, fg_box_logits = fastrcnn_predict_boxes(labels, fastrcnn_box_logits)
fg_label_probs = tf.gather(label_probs, fg_ind, name='fastrcnn_fg_probs')
fg_boxes = tf.gather(proposal_boxes, fg_ind)
fg_box_logits = fg_box_logits / tf.constant(config.FASTRCNN_BBOX_REG_WEIGHTS)
decoded_boxes = decode_bbox_target(fg_box_logits, fg_boxes) # Nfx4, floatbox
decoded_boxes = tf.identity(decoded_boxes, name='fastrcnn_fg_boxes')
def _get_optimizer(self):
lr = symbf.get_scalar_var('learning_rate', 0.003, summary=True)
opt = tf.train.MomentumOptimizer(lr, 0.9)
return optimizer.apply_grad_processors(
opt, [gradproc.ScaleGradient(('.*/b', 2))])
def visualize(model_path, nr_visualize=50, output_dir='output'):
pred = OfflinePredictor(PredictConfig(
model=Model(),
session_init=get_model_loader(model_path),
input_names=['image', 'gt_boxes', 'gt_labels'],
output_names=[
'generate_rpn_proposals/boxes',
'generate_rpn_proposals/probs',
'fastrcnn_all_probs',
'fastrcnn_fg_probs',
'fastrcnn_fg_boxes',
]))
df = get_train_dataflow()
df.reset_state()
if os.path.isdir(output_dir):
shutil.rmtree(output_dir)
utils.fs.mkdir_p(output_dir)
with tqdm.tqdm(total=nr_visualize) as pbar:
for idx, dp in itertools.islice(enumerate(df.get_data()), nr_visualize):
img, _, _, gt_boxes, gt_labels = dp
rpn_boxes, rpn_scores, all_probs, fg_probs, fg_boxes = pred(img, gt_boxes, gt_labels)
gt_viz = draw_annotation(img, gt_boxes, gt_labels)
proposal_viz, good_proposals_ind = draw_proposal_recall(img, rpn_boxes, rpn_scores, gt_boxes)
score_viz = draw_predictions(img, rpn_boxes[good_proposals_ind], all_probs[good_proposals_ind])
fg_boxes = clip_boxes(fg_boxes, img.shape[:2])
fg_viz = draw_predictions(img, fg_boxes, fg_probs)
results = nms_fastrcnn_results(fg_boxes, fg_probs)
final_viz = draw_final_outputs(img, results)
viz = tpviz.stack_patches([
gt_viz, proposal_viz, score_viz,
fg_viz, final_viz], 2, 3)
if os.environ.get('DISPLAY', None):
tpviz.interactive_imshow(viz)
cv2.imwrite("{}/{:03d}.png".format(output_dir, idx), viz)
pbar.update()
def offline_evaluate(model_path, output_file):
pred = OfflinePredictor(PredictConfig(
model=Model(),
session_init=get_model_loader(model_path),
input_names=['image'],
output_names=[
'fastrcnn_fg_probs',
'fastrcnn_fg_boxes',
]))
df = get_eval_dataflow()
df = PrefetchDataZMQ(df, 1)
all_results = eval_on_dataflow(df, lambda img: detect_one_image(img, pred))
with open(output_file, 'w') as f:
json.dump(all_results, f)
print_evaluation_scores(output_file)
def predict(model_path, input_file):
pred = OfflinePredictor(PredictConfig(
model=Model(),
session_init=get_model_loader(model_path),
input_names=['image'],
output_names=[
'fastrcnn_fg_probs',
'fastrcnn_fg_boxes',
]))
img = cv2.imread(input_file, cv2.IMREAD_COLOR)
results = detect_one_image(img, pred)
final = draw_final_outputs(img, results)
viz = np.concatenate((img, final), axis=1)
tpviz.interactive_imshow(viz)
class EvalCallback(Callback):
def _setup_graph(self):
self.pred = self.trainer.get_predictor(['image'], ['fastrcnn_fg_probs', 'fastrcnn_fg_boxes'])
self.df = PrefetchDataZMQ(get_eval_dataflow(), 1)
EVAL_TIMES = 5 # eval 5 times during training
interval = self.trainer.config.max_epoch // (EVAL_TIMES + 1)
self.epochs_to_eval = set([interval * k for k in range(1, EVAL_TIMES)])
self.epochs_to_eval.add(self.trainer.config.max_epoch)
get_tf_nms() # just to make sure the nms part of graph is created
def _eval(self):
all_results = eval_on_dataflow(self.df, lambda img: detect_one_image(img, self.pred))
output_file = os.path.join(
logger.LOG_DIR, 'outputs{}.json'.format(self.global_step))
with open(output_file, 'w') as f:
json.dump(all_results, f)
print_evaluation_scores(output_file)
def _trigger_epoch(self):
if self.epoch_num in self.epochs_to_eval:
self._eval()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
parser.add_argument('--load', help='load model')
parser.add_argument('--logdir', help='logdir', default='train_log/fastrcnn')
parser.add_argument('--datadir', help='override config.BASEDIR')
parser.add_argument('--visualize', action='store_true')
parser.add_argument('--evaluate', help='path to the output json eval file')
parser.add_argument('--predict', help='path to the input image file')
args = parser.parse_args()
if args.datadir:
config.BASEDIR = args.datadir
print_config()
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
nr_gpu = get_nr_gpu()
if args.visualize:
assert args.load
visualize(args.load)
sys.exit()
if args.evaluate is not None:
assert args.evaluate.endswith('.json')
assert args.load
offline_evaluate(args.load, args.evaluate)
sys.exit()
if args.predict is not None:
COCODetection(config.BASEDIR, 'train2014') # to load the class names
assert args.load
predict(args.load, args.predict)
sys.exit()
logger.set_logger_dir(args.logdir, 'd')
stepnum = 300
warmup_epoch = max(math.ceil(500.0 / stepnum), 5)
cfg = TrainConfig(
model=Model(),
dataflow=get_train_dataflow(),
callbacks=[
PeriodicTrigger(ModelSaver(), every_k_epochs=5),
# linear warmup
ScheduledHyperParamSetter('learning_rate',
[(0, 0.003), (warmup_epoch, 0.01)], interp='linear'),
# step decay
ScheduledHyperParamSetter('learning_rate',
[(warmup_epoch, 0.01), ((120000//stepnum) + warmup_epoch, 1e-3), (180000//stepnum, 1e-4)]),
HumanHyperParamSetter('learning_rate'),
EvalCallback(),
GPUUtilizationTracker(),
],
steps_per_epoch=stepnum,
max_epoch=205000//stepnum,
session_init=get_model_loader(args.load),
nr_tower=nr_gpu
)
SyncMultiGPUTrainerReplicated(cfg, gpu_prefetch=False).train()
# Some third-party helper functions
+ generate_anchors.py: copied from [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py).
+ box_ops.py: modified from [TF object detection API](https://github.com/tensorflow/models/blob/master/object_detection/core/box_list_ops.py).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: box_ops.py
import tensorflow as tf
from tensorpack.tfutils.scope_utils import under_name_scope
from tensorpack.tfutils import get_default_sess_config
from tensorpack.utils.argtools import memoized
"""
This file is modified from
https://github.com/tensorflow/models/blob/master/object_detection/core/box_list_ops.py
"""
@under_name_scope()
def area(boxes):
"""
Args:
boxes: nx4 floatbox
Returns:
n
"""
x_min, y_min, x_max, y_max = tf.split(boxes, 4, axis=1)
return tf.squeeze((y_max - y_min) * (x_max - x_min), [1])
@under_name_scope()
def pairwise_intersection(boxlist1, boxlist2):
"""Compute pairwise intersection areas between boxes.
Args:
boxlist1: Nx4 floatbox
boxlist2: Mx4
Returns:
a tensor with shape [N, M] representing pairwise intersections
"""
x_min1, y_min1, x_max1, y_max1 = tf.split(boxlist1, 4, axis=1)
x_min2, y_min2, x_max2, y_max2 = tf.split(boxlist2, 4, axis=1)
all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2))
all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2))
intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin)
all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2))
all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2))
intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin)
return intersect_heights * intersect_widths
@under_name_scope()
def pairwise_iou(boxlist1, boxlist2):
"""Computes pairwise intersection-over-union between box collections.
Args:
boxlist1: Nx4 floatbox
boxlist2: Mx4
Returns:
a tensor with shape [N, M] representing pairwise iou scores.
"""
intersections = pairwise_intersection(boxlist1, boxlist2)
areas1 = area(boxlist1)
areas2 = area(boxlist2)
unions = (
tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
return tf.where(
tf.equal(intersections, 0.0),
tf.zeros_like(intersections), tf.truediv(intersections, unions))
@memoized
def get_iou_callable():
"""
Get a pairwise box iou callable.
"""
with tf.device('/cpu:0'):
A = tf.placeholder(tf.float32, shape=[None, 4])
B = tf.placeholder(tf.float32, shape=[None, 4])
iou = pairwise_iou(A, B)
sess = tf.Session(config=get_default_sess_config())
return sess.make_callable(iou, [A, B])
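# Usage sketch: the returned callable evaluates pairwise IoU on numpy inputs,
# e.g. `iou = get_iou_callable()(boxes_a, boxes_b)` where boxes_a / boxes_b are
# float32 arrays of shape (N, 4) / (M, 4) and iou is an (N, M) numpy array.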
# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------
from six.moves import range
import numpy as np
# Verify that we compute the same anchors as Shaoqing's matlab implementation:
#
# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
# >> anchors
#
# anchors =
#
# -83 -39 100 56
# -175 -87 192 104
# -359 -183 376 200
# -55 -55 72 72
# -119 -119 136 136
# -247 -247 264 264
# -35 -79 52 96
# -79 -167 96 184
# -167 -343 184 360
#array([[ -83., -39., 100., 56.],
# [-175., -87., 192., 104.],
# [-359., -183., 376., 200.],
# [ -55., -55., 72., 72.],
# [-119., -119., 136., 136.],
# [-247., -247., 264., 264.],
# [ -35., -79., 52., 96.],
# [ -79., -167., 96., 184.],
# [-167., -343., 184., 360.]])
def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
scales=2**np.arange(3, 6)):
"""
Generate anchor (reference) windows by enumerating aspect ratios X
scales wrt a reference (0, 0, 15, 15) window.
"""
base_anchor = np.array([1, 1, base_size, base_size], dtype='float32') - 1
ratio_anchors = _ratio_enum(base_anchor, ratios)
anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
for i in range(ratio_anchors.shape[0])])
return anchors
def _whctrs(anchor):
"""
Return width, height, x center, and y center for an anchor (window).
"""
w = anchor[2] - anchor[0] + 1
h = anchor[3] - anchor[1] + 1
x_ctr = anchor[0] + 0.5 * (w - 1)
y_ctr = anchor[1] + 0.5 * (h - 1)
return w, h, x_ctr, y_ctr
def _mkanchors(ws, hs, x_ctr, y_ctr):
"""
Given a vector of widths (ws) and heights (hs) around a center
(x_ctr, y_ctr), output a set of anchors (windows).
"""
ws = ws[:, np.newaxis]
hs = hs[:, np.newaxis]
anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
y_ctr - 0.5 * (hs - 1),
x_ctr + 0.5 * (ws - 1),
y_ctr + 0.5 * (hs - 1)))
return anchors
def _ratio_enum(anchor, ratios):
"""
Enumerate a set of anchors for each aspect ratio wrt an anchor.
"""
w, h, x_ctr, y_ctr = _whctrs(anchor)
size = w * h
size_ratios = size / ratios
ws = np.round(np.sqrt(size_ratios))
hs = np.round(ws * ratios)
anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
return anchors
def _scale_enum(anchor, scales):
"""
Enumerate a set of anchors for each scale wrt an anchor.
"""
w, h, x_ctr, y_ctr = _whctrs(anchor)
ws = w * scales
hs = h * scales
anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
return anchors
if __name__ == '__main__':
#import time
#t = time.time()
#a = generate_anchors()
#print(time.time() - t)
#print(a)
#from IPython import embed; embed()
print(generate_anchors(
16, scales=np.asarray((2, 4, 8, 16, 32), 'float32'),
ratios=[0.5,1,2]))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: viz.py
from six.moves import zip
import numpy as np
from tensorpack.utils import viz
from coco import COCOMeta
from utils.box_ops import get_iou_callable
def draw_annotation(img, boxes, klass, is_crowd=None):
labels = []
assert len(boxes) == len(klass)
if is_crowd is not None:
assert len(boxes) == len(is_crowd)
for cls, crd in zip(klass, is_crowd):
clsname = COCOMeta.class_names[cls]
if crd == 1:
clsname += ';Crowd'
labels.append(clsname)
else:
for cls in klass:
labels.append(COCOMeta.class_names[cls])
img = viz.draw_boxes(img, boxes, labels)
return img
def draw_proposal_recall(img, proposals, proposal_scores, gt_boxes):
"""
Draw top3 proposals for each gt.
Args:
proposals: NPx4
proposal_scores: NP
        gt_boxes: NGx4
"""
bbox_iou_float = get_iou_callable()
box_ious = bbox_iou_float(gt_boxes, proposals) #ng x np
box_ious_argsort = np.argsort(-box_ious, axis=1)
good_proposals_ind = box_ious_argsort[:,:3] # for each gt, find 3 best proposals
good_proposals_ind = np.unique(good_proposals_ind.ravel())
proposals = proposals[good_proposals_ind,:]
tags = list(map(str, proposal_scores[good_proposals_ind]))
img = viz.draw_boxes(img, proposals, tags)
return img, good_proposals_ind
def draw_predictions(img, boxes, scores):
"""
Args:
boxes: kx4
scores: kxC
"""
if len(boxes) == 0:
return img
labels = scores.argmax(axis=1)
scores = scores.max(axis=1)
tags = ["{},{:.2f}".format(COCOMeta.class_names[lb], score) for lb, score in zip(labels, scores)]
return viz.draw_boxes(img, boxes, tags)
def draw_final_outputs(img, results):
"""
Args:
results: [DetectionResult]
"""
all_boxes = []
all_tags = []
for class_id, boxes, scores in results:
all_boxes.extend(boxes)
all_tags.extend(
["{},{:.2f}".format(COCOMeta.class_names[class_id], sc) for sc in scores])
all_boxes = np.asarray(all_boxes)
if all_boxes.shape[0] == 0:
return img
return viz.draw_boxes(img, all_boxes, all_tags)