Commit 64f97425 authored by Patrick Wieschollek's avatar Patrick Wieschollek Committed by Yuxin Wu

Add example for several embedding methods (#102)

* Add example for several embedding methods

This also includes an animation script using avconv and app-util to
reduce repeated code (DRY).

* some docs

* use average for cosine-loss. add some math.

* move data to embedding_data.py; fix inference

* simplify data code
parent eb1fa920
......@@ -12,6 +12,7 @@ Docs & tutorials should be ready within a month. See some [examples](examples) t
+ [Fully-convolutional Network for Holistically-Nested Edge Detection(HED)](examples/HED)
+ [Spatial Transformer Networks on MNIST addition](examples/SpatialTransformer)
+ [Visualize Saliency Maps by Guided ReLU](examples/Saliency)
+ [Similarity Learning on MNIST](examples/SimilarityLearning)
### Reinforcement Learning:
+ [Deep Q-Network(DQN) variants on Atari games](examples/DeepQNetwork)
......
Various Embeddings
==================
Reproduce some embedding methods on MNIST (the loss formulas are summarized below):
* Siamese loss in the paper [Learning a Similarity Metric Discriminatively, with Application to Face
Verification](http://yann.lecun.com/exdb/publis/pdf/chopra-05.pdf).
* Cosine loss
* Triplet loss in the paper [FaceNet: A Unified Embedding for Face Recognition and Clustering](https://arxiv.org/abs/1503.03832)
* Softmax triplet loss in the paper [Deep Metric Learning using Triplet Network](https://arxiv.org/abs/1412.6622)
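
In the notation of the docstrings added by this commit (embedding pairs $l, r$ or triplets $a, p, n$, binary label $y$, margin $m$), the four losses are:

```latex
% Siamese (contrastive) loss, Chopra et al.:
\frac{1}{2}\left[ y \cdot d^2 + (1-y) \cdot \max(0,\, m - d)^2 \right],
  \qquad d = \lVert l - r \rVert_2
% Cosine loss:
\left[ \frac{l \cdot r}{\lVert l \rVert \, \lVert r \rVert} - (2y - 1) \right]^2
% Triplet loss, Schroff et al.:
\max(0,\; m + \lVert a - p \rVert^2 - \lVert a - n \rVert^2)
% Softmax triplet loss, Hoffer et al.: softmax cross-entropy over the
% distance pair (\lVert a - p \rVert, \lVert a - n \rVert), target = negative.
```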
## Usage:
```
# to train:
./mnist-embeddings.py --algorithm [siamese/cosine/triplet/softtriplet]
# to visualize:
./mnist-embeddings.py --algorithm [siamese/cosine/triplet/softtriplet] --visualize --load train_log/mnist-embeddings/checkpoint
```
<p align="center"> <img src="./results.jpg"> </p>
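Visualization saves the scatter plot as `<algorithm>.jpg` in the working directory (see `plt.savefig` in `visualize` below).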
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: embedding_data.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>
import numpy as np
from tensorpack.dataflow import dataset, BatchData
def get_test_data(batch=128):
ds = dataset.Mnist('test')
ds = BatchData(ds, batch)
return ds
def get_digits_by_label(images, labels):
data_dict = []
for clazz in range(0, 10):
clazz_filter = np.where(labels == clazz)
data_dict.append(list(images[clazz_filter].reshape((-1, 28, 28))))
return data_dict
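# The returned structure (an illustrative sketch; 'digits' is a hypothetical
# local name, not part of this file):
#   digits = get_digits_by_label(images, labels)
#   len(digits)   # 10 -- one entry per class
#   digits[3][0]  # a (28, 28) array showing some digit "3"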
class MnistPairs(dataset.Mnist):
"""We could also write
.. code::
ds = dataset.Mnist('train')
ds = JoinData([ds, ds])
ds = MapData(ds, lambda dp: [dp[0], dp[2], dp[1] == dp[3]])
ds = BatchData(ds, 128 // 2)
    but then positive pairs would be rare (p ≈ 0.1: two independently drawn MNIST labels match only about one time in ten).
"""
def __init__(self, train_or_test):
super(MnistPairs, self).__init__(train_or_test, shuffle=False)
# now categorize these digits
self.data_dict = get_digits_by_label(self.images, self.labels)
def pick(self, label):
idx = self.rng.randint(len(self.data_dict[label]))
return self.data_dict[label][idx].astype(np.float32)
def get_data(self):
while True:
y = self.rng.randint(2)
if y == 0:
pick_label, pick_other = self.rng.choice(10, size=2, replace=False)
else:
pick_label = self.rng.randint(10)
pick_other = pick_label
yield [self.pick(pick_label), self.pick(pick_other), y]
class MnistTriplets(MnistPairs):
def get_data(self):
while True:
pick_label, pick_other = self.rng.choice(10, size=2, replace=False)
yield [self.pick(pick_label), self.pick(pick_label), self.pick(pick_other)]
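# A minimal sanity check for the dataflows above (illustrative sketch; it only
# uses the DataFlow interface -- reset_state / get_data -- already exercised
# elsewhere in this commit):
if __name__ == '__main__':
    pairs = MnistPairs('train')
    pairs.reset_state()
    for x, y, label in pairs.get_data():
        print(x.shape, y.shape, label)    # (28, 28) (28, 28) 0 or 1
        break
    triplets = MnistTriplets('train')
    triplets.reset_state()
    for a, p, n in triplets.get_data():
        print(a.shape, p.shape, n.shape)  # three (28, 28) arrays
        break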
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: mnist-embeddings.py
# Author: PatWie <mail@patwie.com>
import tensorflow as tf
from tensorflow.python.platform import flags
import tensorflow.contrib.slim as slim
import numpy as np
import matplotlib
from matplotlib import offsetbox
import matplotlib.pyplot as plt
from tensorpack import *
import tensorpack.tfutils.symbolic_functions as symbf
from tensorpack.tfutils.summary import add_moving_summary
from embedding_data import get_test_data, MnistPairs, MnistTriplets
FLAGS = flags.FLAGS
tf.app.flags.DEFINE_string('load', "", 'load model')
tf.app.flags.DEFINE_integer('gpu', 0, 'used gpu')
tf.app.flags.DEFINE_string('algorithm', "siamese", 'algorithm')
tf.app.flags.DEFINE_boolean('visualize', False, 'show embedding')
class EmbeddingModel(ModelDesc):
def embed(self, x, nfeatures=2):
"""Embed all given tensors into an nfeatures-dim space. """
list_split = 0
if isinstance(x, list):
list_split = len(x)
x = tf.concat_v2(x, 0)
# pre-process MNIST dataflow data
x = tf.expand_dims(x, 3)
x = x * 2 - 1
# the embedding network
net = slim.layers.conv2d(x, 20, 5, scope='conv1')
net = slim.layers.max_pool2d(net, 2, scope='pool1')
net = slim.layers.conv2d(net, 50, 5, scope='conv2')
net = slim.layers.max_pool2d(net, 2, scope='pool2')
net = slim.layers.flatten(net, scope='flatten3')
net = slim.layers.fully_connected(net, 500, scope='fully_connected4')
embeddings = slim.layers.fully_connected(net, nfeatures, activation_fn=None, scope='fully_connected5')
# if "x" was a list of tensors, then split the embeddings
if list_split > 0:
embeddings = tf.split(embeddings, list_split, 0)
return embeddings
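# Shape sketch for EmbeddingModel.embed (illustrative): with two inputs of
# shape [B, 28, 28], tf.concat_v2 yields [2B, 28, 28]; one forward pass through
# the layers above gives [2B, nfeatures]; tf.split then restores two
# [B, nfeatures] tensors. Routing every branch through a single graph copy is
# what shares the weights between the siamese/triplet branches.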
class SiameseModel(EmbeddingModel):
@staticmethod
def get_data():
ds = MnistPairs('train')
ds = BatchData(ds, 128 // 2)
return ds
def _get_input_vars(self):
return [InputVar(tf.float32, (None, 28, 28), 'input'),
InputVar(tf.float32, (None, 28, 28), 'input_y'),
InputVar(tf.int32, (None,), 'label')]
def _build_graph(self, input_vars):
# get inputs
x, y, label = input_vars
# embed them
x, y = self.embed([x, y])
# tag the embedding of 'input' with name 'emb', just for inference later on
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
tf.identity(self.embed(input_vars[0]), name="emb")
# compute the actual loss
cost, pos_dist, neg_dist = symbf.contrastive_loss(x, y, label, 5., extra=True)
self.cost = tf.identity(cost, name="cost")
# track these values during training
add_moving_summary(pos_dist, neg_dist, self.cost)
class CosineModel(SiameseModel):
def _build_graph(self, input_vars):
x, y, label = input_vars
x, y = self.embed([x, y])
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
tf.identity(self.embed(input_vars[0]), name="emb")
cost = symbf.cosine_loss(x, y, label)
self.cost = tf.identity(cost, name="cost")
add_moving_summary(self.cost)
class TripletModel(EmbeddingModel):
@staticmethod
def get_data():
ds = MnistTriplets('train')
ds = BatchData(ds, 128 // 3)
return ds
def _get_input_vars(self):
return [InputVar(tf.float32, (None, 28, 28), 'input'),
InputVar(tf.float32, (None, 28, 28), 'input_p'),
InputVar(tf.float32, (None, 28, 28), 'input_n')]
def loss(self, a, p, n):
return symbf.triplet_loss(a, p, n, 5., extra=True)
def _build_graph(self, input_vars):
a, p, n = input_vars
a, p, n = self.embed([a, p, n])
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
tf.identity(self.embed(input_vars[0]), name="emb")
cost, pos_dist, neg_dist = self.loss(a, p, n)
self.cost = tf.identity(cost, name="cost")
add_moving_summary(pos_dist, neg_dist, self.cost)
class SoftTripletModel(TripletModel):
def loss(self, a, p, n):
return symbf.soft_triplet_loss(a, p, n)
def get_config(model):
logger.auto_set_dir()
dataset = model.get_data()
step_per_epoch = dataset.size()
lr = symbf.get_scalar_var('learning_rate', 1e-4, summary=True)
return TrainConfig(
dataflow=dataset,
model=model(),
optimizer=tf.train.GradientDescentOptimizer(lr),
callbacks=Callbacks([
StatPrinter(),
ModelSaver(),
ScheduledHyperParamSetter('learning_rate', [(10, 1e-5), (20, 1e-6)])
]),
step_per_epoch=step_per_epoch,
max_epoch=20,
)
def visualize(model_path, model):
pred = OfflinePredictor(PredictConfig(
session_init=get_model_loader(model_path),
model=model(),
input_names=['input'],
output_names=['emb']))
NUM_BATCHES = 6
BATCH_SIZE = 128
images = np.zeros((BATCH_SIZE * NUM_BATCHES, 28, 28)) # the used digits
embed = np.zeros((BATCH_SIZE * NUM_BATCHES, 2)) # the actual embeddings in 2-d
# get only the embedding model data (MNIST test)
ds = get_test_data()
ds.reset_state()
for offset, dp in enumerate(ds.get_data()):
digit, label = dp
prediction = pred([digit])[0]
embed[offset * BATCH_SIZE:offset * BATCH_SIZE + BATCH_SIZE, ...] = prediction
images[offset * BATCH_SIZE:offset * BATCH_SIZE + BATCH_SIZE, ...] = digit
        if offset + 1 == NUM_BATCHES:  # stop once NUM_BATCHES batches are filled
            break
plt.figure()
ax = plt.subplot(111)
ax_min = np.min(embed, 0)
ax_max = np.max(embed, 0)
ax_dist_sq = np.sum((ax_max - ax_min)**2)
ax.axis('off')
shown_images = np.array([[1., 1.]])
for i in range(embed.shape[0]):
dist = np.sum((embed[i] - shown_images)**2, 1)
if np.min(dist) < 3e-4 * ax_dist_sq: # don't show points that are too close
continue
shown_images = np.r_[shown_images, [embed[i]]]
imagebox = offsetbox.AnnotationBbox(offsetbox.OffsetImage(np.reshape(images[i, ...], [28, 28]),
zoom=0.6, cmap=plt.cm.gray_r), xy=embed[i], frameon=False)
ax.add_artist(imagebox)
plt.axis([ax_min[0], ax_max[0], ax_min[1], ax_max[1]])
plt.xticks([]), plt.yticks([])
algo_name = FLAGS.algorithm
plt.title('Embedding using %s-loss' % algo_name)
plt.savefig('%s.jpg' % algo_name)
if __name__ == '__main__':
unknown = FLAGS._parse_flags()
assert len(unknown) == 0, "Invalid argument!"
assert FLAGS.algorithm in ["siamese", "cosine", "triplet", "softtriplet"]
ALGO_CONFIGS = {"siamese": SiameseModel,
"cosine": CosineModel,
"triplet": TripletModel,
"softtriplet": SoftTripletModel}
with change_gpu(FLAGS.gpu):
if FLAGS.visualize:
visualize(FLAGS.load, ALGO_CONFIGS[FLAGS.algorithm])
else:
config = get_config(ALGO_CONFIGS[FLAGS.algorithm])
            if FLAGS.load:
                config.session_init = SaverRestore(FLAGS.load)
            # train in both cases; --load only seeds the session with a checkpoint
            SimpleTrainer(config).train()
......@@ -78,6 +78,7 @@ due to an alternative in a different tower".format(v.name, var_dict[name].name))
self.path,
global_step=get_global_step(),
write_meta_graph=False)
logger.info("Model saved to %s" % tf.train.get_checkpoint_state(self.checkpoint_dir).model_checkpoint_path)
except (OSError, IOError): # disk error sometimes.. just ignore it
logger.exception("Exception in ModelSaver.trigger_epoch!")
......
......@@ -10,8 +10,8 @@ import numpy as np
def prediction_incorrect(logits, label, topk=1, name='incorrect_vector'):
"""
Args:
-        logits: (N,C)
-        label: (N,)
+        logits: shape [B,C].
+        label: shape [B].
topk(int): topk
Returns:
a float32 vector of length N with 0/1 values. 1 means incorrect
......@@ -154,10 +154,10 @@ def psnr_loss(prediction, ground_truth, name='psnr_loss'):
.. math::
-        PSNR = 20 \cdot log_{10}(MAX_p) - 10 \cdot log_{10}(MSE)
+        PSNR = 20 \cdot \log_{10}(MAX_p) - 10 \cdot \log_{10}(MSE)
     This function assumes the maximum possible value of the signal is 1,
-    therefore the PSNR is simply ``- 10 * log10(MSE)``.
+    therefore the PSNR is simply :math:`- 10 \cdot \log_{10}(MSE)`.
Args:
prediction: a :class:`tf.Tensor` representing the prediction signal.
......@@ -188,7 +188,7 @@ def guided_relu():
from tensorflow.python.ops import gen_nn_ops # noqa
@tf.RegisterGradient("GuidedReLU")
-    def _GuidedReluGrad(op, grad):
+    def GuidedReluGrad(op, grad):
return tf.where(0. < grad,
gen_nn_ops._relu_grad(grad, op.outputs[0]),
tf.zeros(grad.get_shape()))
......@@ -212,3 +212,152 @@ def saliency_map(output, input, name="saliency_map"):
saliency_op = tf.gradients(max_outp, input)[:][0]
saliency_op = tf.identity(saliency_op, name=name)
return saliency_op
def contrastive_loss(left, right, y, margin, extra=False):
r"""Loss for Siamese networks as described in the paper:
`Learning a Similarity Metric Discriminatively, with Application to Face
Verification <http://yann.lecun.com/exdb/publis/pdf/chopra-05.pdf>`_ by Chopra et al.
.. math::
\frac{1}{2} [y \cdot d^2 + (1-y) \cdot \max(0, m - d)^2], d = \Vert l - r \Vert_2
Args:
left (tf.Tensor): left feature vectors of shape [Batch, N].
right (tf.Tensor): right feature vectors of shape [Batch, N].
y (tf.Tensor): binary labels of shape [Batch]. 1: similar, 0: not similar.
        margin (float): distance margin for negative examples (y==0).
extra (bool): also return distances for pos and neg.
Returns:
        tf.Tensor: contrastive_loss (averaged over the batch), and optionally average_pos_dist, average_neg_dist
"""
with tf.name_scope("constrastive_loss"):
y = tf.cast(y, tf.float32)
delta = tf.reduce_sum(tf.square(left - right), 1)
delta_sqrt = tf.sqrt(delta + 1e-10)
match_loss = delta
        mismatch_loss = tf.square(tf.nn.relu(margin - delta_sqrt))
        loss = tf.reduce_mean(0.5 * (y * match_loss + (1 - y) * mismatch_loss))
if extra:
num_pos = tf.count_nonzero(y)
num_neg = tf.count_nonzero(1 - y)
pos_dist = tf.where(tf.equal(num_pos, 0), 0.,
tf.reduce_sum(y * delta_sqrt) / tf.cast(num_pos, tf.float32),
name="pos-dist")
neg_dist = tf.where(tf.equal(num_neg, 0), 0.,
tf.reduce_sum((1 - y) * delta_sqrt) / tf.cast(num_neg, tf.float32),
name="neg-dist")
return loss, pos_dist, neg_dist
else:
return loss
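# Worked example for contrastive_loss (hand-picked numbers):
#   left = [[0., 0.]], right = [[3., 4.]]  =>  d = 5, d^2 = 25
#   y = [1] (similar):      loss = 0.5 * 25               = 12.5
#   y = [0], margin = 5.:   loss = 0.5 * max(0, 5 - 5)^2  = 0
# Similar pairs are pulled together; dissimilar pairs are pushed apart only
# while they are closer than `margin`.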
def cosine_loss(left, right, y):
r"""Loss for Siamese networks (cosine version).
    Same as :func:`contrastive_loss` but with a different similarity measurement.
.. math::
[\frac{l \cdot r}{\lVert l\rVert \lVert r\rVert} - (2y-1)]^2
Args:
left (tf.Tensor): left feature vectors of shape [Batch, N].
right (tf.Tensor): right feature vectors of shape [Batch, N].
y (tf.Tensor): binary labels of shape [Batch]. 1: similar, 0: not similar.
Returns:
tf.Tensor: cosine-loss as a scalar tensor.
"""
def l2_norm(t, eps=1e-12):
"""
Returns:
tf.Tensor: norm of 2D input tensor on axis 1
"""
with tf.name_scope("l2_norm"):
return tf.sqrt(tf.reduce_sum(tf.square(t), 1) + eps)
with tf.name_scope("cosine_loss"):
y = 2 * tf.cast(y, tf.float32) - 1
pred = tf.reduce_sum(left * right, 1) / (l2_norm(left) * l2_norm(right) + 1e-10)
return tf.nn.l2_loss(y - pred) / tf.cast(tf.shape(left)[0], tf.float32)
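# Worked example for cosine_loss: for l == r (so pred = 1) and y = 1, the
# target 2*y - 1 = 1 and the loss is 0; for y = 0 the target is -1 and the
# per-pair loss is 0.5 * ((-1) - 1)^2 = 2, since tf.nn.l2_loss halves the
# squared error before the division by the batch size.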
def triplet_loss(anchor, positive, negative, margin, extra=False):
r"""Loss for Triplet networks as described in the paper:
`FaceNet: A Unified Embedding for Face Recognition and Clustering
<https://arxiv.org/abs/1503.03832>`_
by Schroff et al.
Learn embeddings from an anchor point and a similar input (positive) as
well as a not-similar input (negative).
Intuitively, a matching pair (anchor, positive) should have a smaller relative distance
than a non-matching pair (anchor, negative).
.. math::
\max(0, m + \Vert a-p\Vert^2 - \Vert a-n\Vert^2)
Args:
anchor (tf.Tensor): anchor feature vectors of shape [Batch, N].
positive (tf.Tensor): features of positive match of the same shape.
negative (tf.Tensor): features of negative match of the same shape.
        margin (float): distance margin for negative examples
extra (bool): also return distances for pos and neg.
Returns:
tf.Tensor: triplet-loss as scalar (and optionally average_pos_dist, average_neg_dist)
"""
with tf.name_scope("triplet_loss"):
d_pos = tf.reduce_sum(tf.square(anchor - positive), 1)
d_neg = tf.reduce_sum(tf.square(anchor - negative), 1)
loss = tf.reduce_mean(tf.maximum(0., margin + d_pos - d_neg))
if extra:
pos_dist = tf.reduce_mean(tf.sqrt(d_pos + 1e-10), name='pos-dist')
neg_dist = tf.reduce_mean(tf.sqrt(d_neg + 1e-10), name='neg-dist')
return loss, pos_dist, neg_dist
else:
return loss
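# Worked example for triplet_loss (hand-picked numbers, margin = 5.):
#   anchor = positive = [[0., 0.]], negative = [[3., 4.]]
#   d_pos = 0, d_neg = 25  =>  loss = max(0, 5 + 0 - 25) = 0
# With negative = [[1., 2.]]: d_neg = 5  =>  loss = max(0, 5 + 0 - 5) = 0;
# any negative closer than that starts to incur a positive loss.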
def soft_triplet_loss(anchor, positive, negative, extra=True):
"""Loss for triplet networks as described in the paper:
`Deep Metric Learning using Triplet Network
<https://arxiv.org/abs/1412.6622>`_ by Hoffer et al.
    It is a softmax loss using the distances :math:`\Vert a-p\Vert` and
    :math:`\Vert a-n\Vert` as logits.
Args:
anchor (tf.Tensor): anchor feature vectors of shape [Batch, N].
positive (tf.Tensor): features of positive match of the same shape.
negative (tf.Tensor): features of negative match of the same shape.
extra (bool): also return distances for pos and neg.
Returns:
tf.Tensor: triplet-loss as scalar (and optionally average_pos_dist, average_neg_dist)
"""
eps = 1e-10
with tf.name_scope("soft_triplet_loss"):
d_pos = tf.sqrt(tf.reduce_sum(tf.square(anchor - positive), 1) + eps)
d_neg = tf.sqrt(tf.reduce_sum(tf.square(anchor - negative), 1) + eps)
logits = tf.stack([d_pos, d_neg], axis=1)
ones = tf.ones_like(tf.squeeze(d_pos), dtype="int32")
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ones))
if extra:
pos_dist = tf.reduce_mean(d_pos, name='pos-dist')
neg_dist = tf.reduce_mean(d_neg, name='neg-dist')
return loss, pos_dist, neg_dist
else:
return loss
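# Note on soft_triplet_loss: the logits are the stacked distances
# [d_pos, d_neg] and the fixed target is class 1, so the cross-entropy
#   -log( exp(d_neg) / (exp(d_pos) + exp(d_neg)) )
# is minimized by making d_neg large relative to d_pos -- a smooth,
# margin-free counterpart of triplet_loss above.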