Commit 64f97425 authored by Patrick Wieschollek's avatar Patrick Wieschollek Committed by Yuxin Wu

Add example for several embedding methods (#102)

* Add example for several embedding methods

This also includes an animation script using avconv and app-util to
reduce repeated code (DRY).

* some docs

* use average for cosine-loss. add some math.

* move data to embedding_data.py; fix inference

* simplify data code
parent eb1fa920
......@@ -12,6 +12,7 @@ Docs & tutorials should be ready within a month. See some [examples](examples) t
+ [Fully-convolutional Network for Holistically-Nested Edge Detection(HED)](examples/HED)
+ [Spatial Transformer Networks on MNIST addition](examples/SpatialTransformer)
+ [Visualize Saliency Maps by Guided ReLU](examples/Saliency)
+ [Similarity Learning on MNIST](examples/SimilarityLearning)
### Reinforcement Learning:
+ [Deep Q-Network(DQN) variants on Atari games](examples/DeepQNetwork)
......
Various Embeddings
==================
Reproduce some embedding methods on MNIST (the loss formulas are summarized below):
* Siamese loss in the paper [Learning a Similarity Metric Discriminatively, with Application to Face
Verification](http://yann.lecun.com/exdb/publis/pdf/chopra-05.pdf).
* Cosine loss
* Triplet loss in the paper [FaceNet: A Unified Embedding for Face Recognition and Clustering](https://arxiv.org/abs/1503.03832)
* Softmax triplet loss in the paper [Deep Metric Learning using Triplet Network](https://arxiv.org/abs/1412.6622)
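
In the notation of the docstrings added by this commit (embedding pairs $l, r$ or triplets $a, p, n$, binary label $y$, margin $m$), the four losses are:

```latex
% Siamese (contrastive) loss, Chopra et al.:
\frac{1}{2}\left[ y \cdot d^2 + (1-y) \cdot \max(0,\, m - d)^2 \right],
  \qquad d = \lVert l - r \rVert_2
% Cosine loss:
\left[ \frac{l \cdot r}{\lVert l \rVert \, \lVert r \rVert} - (2y - 1) \right]^2
% Triplet loss, Schroff et al.:
\max(0,\; m + \lVert a - p \rVert^2 - \lVert a - n \rVert^2)
% Softmax triplet loss, Hoffer et al.: softmax cross-entropy over the
% distance pair (\lVert a - p \rVert, \lVert a - n \rVert), target = negative.
```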
## Usage:
```
# to train:
./mnist-embeddings.py --algorithm [siamese/cosine/triplet/softtriplet]
# to visualize:
./mnist-embeddings.py --algorithm [siamese/cosine/triplet/softtriplet] --visualize --load train_log/mnist-embeddings/checkpoint
```
<p align="center"> <img src="./results.jpg"> </p>
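Visualization saves the scatter plot as `<algorithm>.jpg` in the working directory (see `plt.savefig` in `visualize` below).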
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: embedding_data.py
# Author: Yuxin Wu <ppwwyyxxc@gmail.com>
import numpy as np
from tensorpack.dataflow import dataset, BatchData
def get_test_data(batch=128):
ds = dataset.Mnist('test')
ds = BatchData(ds, batch)
return ds
def get_digits_by_label(images, labels):
data_dict = []
for clazz in range(0, 10):
clazz_filter = np.where(labels == clazz)
data_dict.append(list(images[clazz_filter].reshape((-1, 28, 28))))
return data_dict
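# The returned structure (an illustrative sketch; 'digits' is a hypothetical
# local name, not part of this file):
#   digits = get_digits_by_label(images, labels)
#   len(digits)   # 10 -- one entry per class
#   digits[3][0]  # a (28, 28) array showing some digit "3"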
class MnistPairs(dataset.Mnist):
"""We could also write
.. code::
ds = dataset.Mnist('train')
ds = JoinData([ds, ds])
ds = MapData(ds, lambda dp: [dp[0], dp[2], dp[1] == dp[3]])
ds = BatchData(ds, 128 // 2)
    but then positive pairs would be rare (p ≈ 0.1: two independently drawn MNIST labels match only about one time in ten).
"""
def __init__(self, train_or_test):
super(MnistPairs, self).__init__(train_or_test, shuffle=False)
# now categorize these digits
self.data_dict = get_digits_by_label(self.images, self.labels)
def pick(self, label):
idx = self.rng.randint(len(self.data_dict[label]))
return self.data_dict[label][idx].astype(np.float32)
def get_data(self):
while True:
y = self.rng.randint(2)
if y == 0:
pick_label, pick_other = self.rng.choice(10, size=2, replace=False)
else:
pick_label = self.rng.randint(10)
pick_other = pick_label
yield [self.pick(pick_label), self.pick(pick_other), y]
class MnistTriplets(MnistPairs):
def get_data(self):
while True:
pick_label, pick_other = self.rng.choice(10, size=2, replace=False)
yield [self.pick(pick_label), self.pick(pick_label), self.pick(pick_other)]
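# A minimal sanity check for the dataflows above (illustrative sketch; it only
# uses the DataFlow interface -- reset_state / get_data -- already exercised
# elsewhere in this commit):
if __name__ == '__main__':
    pairs = MnistPairs('train')
    pairs.reset_state()
    for x, y, label in pairs.get_data():
        print(x.shape, y.shape, label)    # (28, 28) (28, 28) 0 or 1
        break
    triplets = MnistTriplets('train')
    triplets.reset_state()
    for a, p, n in triplets.get_data():
        print(a.shape, p.shape, n.shape)  # three (28, 28) arrays
        break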
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: mnist-embeddings.py
# Author: PatWie <mail@patwie.com>
import tensorflow as tf
from tensorflow.python.platform import flags
import tensorflow.contrib.slim as slim
import numpy as np
import matplotlib
from matplotlib import offsetbox
import matplotlib.pyplot as plt
from tensorpack import *
import tensorpack.tfutils.symbolic_functions as symbf
from tensorpack.tfutils.summary import add_moving_summary
from embedding_data import get_test_data, MnistPairs, MnistTriplets
FLAGS = flags.FLAGS
tf.app.flags.DEFINE_string('load', "", 'load model')
tf.app.flags.DEFINE_integer('gpu', 0, 'used gpu')
tf.app.flags.DEFINE_string('algorithm', "siamese", 'algorithm')
tf.app.flags.DEFINE_boolean('visualize', False, 'show embedding')
class EmbeddingModel(ModelDesc):
def embed(self, x, nfeatures=2):
"""Embed all given tensors into an nfeatures-dim space. """
list_split = 0
if isinstance(x, list):
list_split = len(x)
x = tf.concat_v2(x, 0)
# pre-process MNIST dataflow data
x = tf.expand_dims(x, 3)
x = x * 2 - 1
# the embedding network
net = slim.layers.conv2d(x, 20, 5, scope='conv1')
net = slim.layers.max_pool2d(net, 2, scope='pool1')
net = slim.layers.conv2d(net, 50, 5, scope='conv2')
net = slim.layers.max_pool2d(net, 2, scope='pool2')
net = slim.layers.flatten(net, scope='flatten3')
net = slim.layers.fully_connected(net, 500, scope='fully_connected4')
embeddings = slim.layers.fully_connected(net, nfeatures, activation_fn=None, scope='fully_connected5')
# if "x" was a list of tensors, then split the embeddings
if list_split > 0:
embeddings = tf.split(embeddings, list_split, 0)
return embeddings
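# Shape sketch for EmbeddingModel.embed (illustrative): with two inputs of
# shape [B, 28, 28], tf.concat_v2 yields [2B, 28, 28]; one forward pass through
# the layers above gives [2B, nfeatures]; tf.split then restores two
# [B, nfeatures] tensors. Routing every branch through a single graph copy is
# what shares the weights between the siamese/triplet branches.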
class SiameseModel(EmbeddingModel):
@staticmethod
def get_data():
ds = MnistPairs('train')
ds = BatchData(ds, 128 // 2)
return ds
def _get_input_vars(self):
return [InputVar(tf.float32, (None, 28, 28), 'input'),
InputVar(tf.float32, (None, 28, 28), 'input_y'),
InputVar(tf.int32, (None,), 'label')]
def _build_graph(self, input_vars):
# get inputs
x, y, label = input_vars
# embed them
x, y = self.embed([x, y])
# tag the embedding of 'input' with name 'emb', just for inference later on
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
tf.identity(self.embed(input_vars[0]), name="emb")
# compute the actual loss
cost, pos_dist, neg_dist = symbf.contrastive_loss(x, y, label, 5., extra=True)
self.cost = tf.identity(cost, name="cost")
# track these values during training
add_moving_summary(pos_dist, neg_dist, self.cost)
class CosineModel(SiameseModel):
def _build_graph(self, input_vars):
x, y, label = input_vars
x, y = self.embed([x, y])
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
tf.identity(self.embed(input_vars[0]), name="emb")
cost = symbf.cosine_loss(x, y, label)
self.cost = tf.identity(cost, name="cost")
add_moving_summary(self.cost)
class TripletModel(EmbeddingModel):
@staticmethod
def get_data():
ds = MnistTriplets('train')
ds = BatchData(ds, 128 // 3)
return ds
def _get_input_vars(self):
return [InputVar(tf.float32, (None, 28, 28), 'input'),
InputVar(tf.float32, (None, 28, 28), 'input_p'),
InputVar(tf.float32, (None, 28, 28), 'input_n')]
def loss(self, a, p, n):
return symbf.triplet_loss(a, p, n, 5., extra=True)
def _build_graph(self, input_vars):
a, p, n = input_vars
a, p, n = self.embed([a, p, n])
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
tf.identity(self.embed(input_vars[0]), name="emb")
cost, pos_dist, neg_dist = self.loss(a, p, n)
self.cost = tf.identity(cost, name="cost")
add_moving_summary(pos_dist, neg_dist, self.cost)
class SoftTripletModel(TripletModel):
def loss(self, a, p, n):
return symbf.soft_triplet_loss(a, p, n)
def get_config(model):
logger.auto_set_dir()
dataset = model.get_data()
step_per_epoch = dataset.size()
lr = symbf.get_scalar_var('learning_rate', 1e-4, summary=True)
return TrainConfig(
dataflow=dataset,
model=model(),
optimizer=tf.train.GradientDescentOptimizer(lr),
callbacks=Callbacks([
StatPrinter(),
ModelSaver(),
ScheduledHyperParamSetter('learning_rate', [(10, 1e-5), (20, 1e-6)])
]),
step_per_epoch=step_per_epoch,
max_epoch=20,
)
def visualize(model_path, model):
pred = OfflinePredictor(PredictConfig(
session_init=get_model_loader(model_path),
model=model(),
input_names=['input'],
output_names=['emb']))
NUM_BATCHES = 6
BATCH_SIZE = 128
images = np.zeros((BATCH_SIZE * NUM_BATCHES, 28, 28)) # the used digits
embed = np.zeros((BATCH_SIZE * NUM_BATCHES, 2)) # the actual embeddings in 2-d
# get only the embedding model data (MNIST test)
ds = get_test_data()
ds.reset_state()
for offset, dp in enumerate(ds.get_data()):
digit, label = dp
prediction = pred([digit])[0]
embed[offset * BATCH_SIZE:offset * BATCH_SIZE + BATCH_SIZE, ...] = prediction
images[offset * BATCH_SIZE:offset * BATCH_SIZE + BATCH_SIZE, ...] = digit
        if offset + 1 == NUM_BATCHES:  # stop once NUM_BATCHES batches are filled
            break
plt.figure()
ax = plt.subplot(111)
ax_min = np.min(embed, 0)
ax_max = np.max(embed, 0)
ax_dist_sq = np.sum((ax_max - ax_min)**2)
ax.axis('off')
shown_images = np.array([[1., 1.]])
for i in range(embed.shape[0]):
dist = np.sum((embed[i] - shown_images)**2, 1)
if np.min(dist) < 3e-4 * ax_dist_sq: # don't show points that are too close
continue
shown_images = np.r_[shown_images, [embed[i]]]
imagebox = offsetbox.AnnotationBbox(offsetbox.OffsetImage(np.reshape(images[i, ...], [28, 28]),
zoom=0.6, cmap=plt.cm.gray_r), xy=embed[i], frameon=False)
ax.add_artist(imagebox)
plt.axis([ax_min[0], ax_max[0], ax_min[1], ax_max[1]])
plt.xticks([]), plt.yticks([])
algo_name = FLAGS.algorithm
plt.title('Embedding using %s-loss' % algo_name)
plt.savefig('%s.jpg' % algo_name)
if __name__ == '__main__':
unknown = FLAGS._parse_flags()
assert len(unknown) == 0, "Invalid argument!"
assert FLAGS.algorithm in ["siamese", "cosine", "triplet", "softtriplet"]
ALGO_CONFIGS = {"siamese": SiameseModel,
"cosine": CosineModel,
"triplet": TripletModel,
"softtriplet": SoftTripletModel}
with change_gpu(FLAGS.gpu):
if FLAGS.visualize:
visualize(FLAGS.load, ALGO_CONFIGS[FLAGS.algorithm])
else:
config = get_config(ALGO_CONFIGS[FLAGS.algorithm])
            if FLAGS.load:
                config.session_init = SaverRestore(FLAGS.load)
            # train in both cases; --load only seeds the session with a checkpoint
            SimpleTrainer(config).train()
......@@ -78,6 +78,7 @@ due to an alternative in a different tower".format(v.name, var_dict[name].name))
self.path,
global_step=get_global_step(),
write_meta_graph=False)
logger.info("Model saved to %s" % tf.train.get_checkpoint_state(self.checkpoint_dir).model_checkpoint_path)
except (OSError, IOError): # disk error sometimes.. just ignore it
logger.exception("Exception in ModelSaver.trigger_epoch!")
......
......@@ -10,8 +10,8 @@ import numpy as np
def prediction_incorrect(logits, label, topk=1, name='incorrect_vector'):
"""
Args:
-        logits: (N,C)
-        label: (N,)
+        logits: shape [B,C].
+        label: shape [B].
topk(int): topk
Returns:
a float32 vector of length N with 0/1 values. 1 means incorrect
......@@ -154,10 +154,10 @@ def psnr_loss(prediction, ground_truth, name='psnr_loss'):
.. math::
-        PSNR = 20 \cdot log_{10}(MAX_p) - 10 \cdot log_{10}(MSE)
+        PSNR = 20 \cdot \log_{10}(MAX_p) - 10 \cdot \log_{10}(MSE)
     This function assumes the maximum possible value of the signal is 1,
-    therefore the PSNR is simply ``- 10 * log10(MSE)``.
+    therefore the PSNR is simply :math:`- 10 \cdot \log_{10}(MSE)`.
Args:
prediction: a :class:`tf.Tensor` representing the prediction signal.
......@@ -188,7 +188,7 @@ def guided_relu():
from tensorflow.python.ops import gen_nn_ops # noqa
@tf.RegisterGradient("GuidedReLU")
-    def _GuidedReluGrad(op, grad):
+    def GuidedReluGrad(op, grad):
return tf.where(0. < grad,
gen_nn_ops._relu_grad(grad, op.outputs[0]),
tf.zeros(grad.get_shape()))
......@@ -212,3 +212,152 @@ def saliency_map(output, input, name="saliency_map"):
saliency_op = tf.gradients(max_outp, input)[:][0]
saliency_op = tf.identity(saliency_op, name=name)
return saliency_op
def contrastive_loss(left, right, y, margin, extra=False):
r"""Loss for Siamese networks as described in the paper:
`Learning a Similarity Metric Discriminatively, with Application to Face
Verification <http://yann.lecun.com/exdb/publis/pdf/chopra-05.pdf>`_ by Chopra et al.
.. math::
\frac{1}{2} [y \cdot d^2 + (1-y) \cdot \max(0, m - d)^2], d = \Vert l - r \Vert_2
Args:
left (tf.Tensor): left feature vectors of shape [Batch, N].
right (tf.Tensor): right feature vectors of shape [Batch, N].
y (tf.Tensor): binary labels of shape [Batch]. 1: similar, 0: not similar.
        margin (float): distance margin for negative examples (y==0).
extra (bool): also return distances for pos and neg.
Returns:
        tf.Tensor: contrastive_loss (averaged over the batch), and optionally average_pos_dist, average_neg_dist
"""
with tf.name_scope("constrastive_loss"):
y = tf.cast(y, tf.float32)
delta = tf.reduce_sum(tf.square(left - right), 1)
delta_sqrt = tf.sqrt(delta + 1e-10)
match_loss = delta
        mismatch_loss = tf.square(tf.nn.relu(margin - delta_sqrt))
        loss = tf.reduce_mean(0.5 * (y * match_loss + (1 - y) * mismatch_loss))
if extra:
num_pos = tf.count_nonzero(y)
num_neg = tf.count_nonzero(1 - y)
pos_dist = tf.where(tf.equal(num_pos, 0), 0.,
tf.reduce_sum(y * delta_sqrt) / tf.cast(num_pos, tf.float32),
name="pos-dist")
neg_dist = tf.where(tf.equal(num_neg, 0), 0.,
tf.reduce_sum((1 - y) * delta_sqrt) / tf.cast(num_neg, tf.float32),
name="neg-dist")
return loss, pos_dist, neg_dist
else:
return loss
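# Worked example for contrastive_loss (hand-picked numbers):
#   left = [[0., 0.]], right = [[3., 4.]]  =>  d = 5, d^2 = 25
#   y = [1] (similar):      loss = 0.5 * 25               = 12.5
#   y = [0], margin = 5.:   loss = 0.5 * max(0, 5 - 5)^2  = 0
# Similar pairs are pulled together; dissimilar pairs are pushed apart only
# while they are closer than `margin`.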
def cosine_loss(left, right, y):
r"""Loss for Siamese networks (cosine version).
    Same as :func:`contrastive_loss` but with a different similarity measurement.
.. math::
[\frac{l \cdot r}{\lVert l\rVert \lVert r\rVert} - (2y-1)]^2
Args:
left (tf.Tensor): left feature vectors of shape [Batch, N].
right (tf.Tensor): right feature vectors of shape [Batch, N].
y (tf.Tensor): binary labels of shape [Batch]. 1: similar, 0: not similar.
Returns:
tf.Tensor: cosine-loss as a scalar tensor.
"""
def l2_norm(t, eps=1e-12):
"""
Returns:
tf.Tensor: norm of 2D input tensor on axis 1
"""
with tf.name_scope("l2_norm"):
return tf.sqrt(tf.reduce_sum(tf.square(t), 1) + eps)
with tf.name_scope("cosine_loss"):
y = 2 * tf.cast(y, tf.float32) - 1
pred = tf.reduce_sum(left * right, 1) / (l2_norm(left) * l2_norm(right) + 1e-10)
return tf.nn.l2_loss(y - pred) / tf.cast(tf.shape(left)[0], tf.float32)
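# Worked example for cosine_loss: for l == r (so pred = 1) and y = 1, the
# target 2*y - 1 = 1 and the loss is 0; for y = 0 the target is -1 and the
# per-pair loss is 0.5 * ((-1) - 1)^2 = 2, since tf.nn.l2_loss halves the
# squared error before the division by the batch size.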
def triplet_loss(anchor, positive, negative, margin, extra=False):
r"""Loss for Triplet networks as described in the paper:
`FaceNet: A Unified Embedding for Face Recognition and Clustering
<https://arxiv.org/abs/1503.03832>`_
by Schroff et al.
Learn embeddings from an anchor point and a similar input (positive) as
well as a not-similar input (negative).
Intuitively, a matching pair (anchor, positive) should have a smaller relative distance
than a non-matching pair (anchor, negative).
.. math::
\max(0, m + \Vert a-p\Vert^2 - \Vert a-n\Vert^2)
Args:
anchor (tf.Tensor): anchor feature vectors of shape [Batch, N].
positive (tf.Tensor): features of positive match of the same shape.
negative (tf.Tensor): features of negative match of the same shape.
        margin (float): distance margin for negative examples
extra (bool): also return distances for pos and neg.
Returns:
tf.Tensor: triplet-loss as scalar (and optionally average_pos_dist, average_neg_dist)
"""
with tf.name_scope("triplet_loss"):
d_pos = tf.reduce_sum(tf.square(anchor - positive), 1)
d_neg = tf.reduce_sum(tf.square(anchor - negative), 1)
loss = tf.reduce_mean(tf.maximum(0., margin + d_pos - d_neg))
if extra:
pos_dist = tf.reduce_mean(tf.sqrt(d_pos + 1e-10), name='pos-dist')
neg_dist = tf.reduce_mean(tf.sqrt(d_neg + 1e-10), name='neg-dist')
return loss, pos_dist, neg_dist
else:
return loss
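# Worked example for triplet_loss (hand-picked numbers, margin = 5.):
#   anchor = positive = [[0., 0.]], negative = [[3., 4.]]
#   d_pos = 0, d_neg = 25  =>  loss = max(0, 5 + 0 - 25) = 0
# With negative = [[1., 2.]]: d_neg = 5  =>  loss = max(0, 5 + 0 - 5) = 0;
# any negative closer than that starts to incur a positive loss.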
def soft_triplet_loss(anchor, positive, negative, extra=True):
"""Loss for triplet networks as described in the paper:
`Deep Metric Learning using Triplet Network
<https://arxiv.org/abs/1412.6622>`_ by Hoffer et al.
    It is a softmax loss using the distances :math:`\Vert a-p\Vert` and
    :math:`\Vert a-n\Vert` as logits.
Args:
anchor (tf.Tensor): anchor feature vectors of shape [Batch, N].
positive (tf.Tensor): features of positive match of the same shape.
negative (tf.Tensor): features of negative match of the same shape.
extra (bool): also return distances for pos and neg.
Returns:
tf.Tensor: triplet-loss as scalar (and optionally average_pos_dist, average_neg_dist)
"""
eps = 1e-10
with tf.name_scope("soft_triplet_loss"):
d_pos = tf.sqrt(tf.reduce_sum(tf.square(anchor - positive), 1) + eps)
d_neg = tf.sqrt(tf.reduce_sum(tf.square(anchor - negative), 1) + eps)
logits = tf.stack([d_pos, d_neg], axis=1)
ones = tf.ones_like(tf.squeeze(d_pos), dtype="int32")
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ones))
if extra:
pos_dist = tf.reduce_mean(d_pos, name='pos-dist')
neg_dist = tf.reduce_mean(d_neg, name='neg-dist')
return loss, pos_dist, neg_dist
else:
return loss
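# Note on soft_triplet_loss: the logits are the stacked distances
# [d_pos, d_neg] and the fixed target is class 1, so the cross-entropy
#   -log( exp(d_neg) / (exp(d_pos) + exp(d_neg)) )
# is minimized by making d_neg large relative to d_pos -- a smooth,
# margin-free counterpart of triplet_loss above.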