Commit c0828b78 authored by Robert Haas's avatar Robert Haas

Move the guts of our Levenshtein implementation into core.

The hope is that we can use this to produce better diagnostics in
some cases.

Peter Geoghegan, reviewed by Michael Paquier, with some further
changes by me.
parent 1d69ae41
......@@ -17,6 +17,3 @@ top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif
# levenshtein.c is #included by fuzzystrmatch.c
fuzzystrmatch.o: fuzzystrmatch.c levenshtein.c
......@@ -154,23 +154,6 @@ getcode(char c)
/* These prevent GH from becoming F */
#define NOGHTOF(c) (getcode(c) & 16) /* BDH */
/*
 * Report whether the first "len" bytes of s1 and s2 are identical.
 *
 * Bytes are compared starting from the last one and working backwards.
 * A non-positive "len" compares nothing and yields true.  Intended as a
 * cheaper replacement for memcmp() for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			pos;

	for (pos = len; pos > 0; pos--)
	{
		if (s1[pos - 1] != s2[pos - 1])
			return false;
	}

	return true;
}
#include "levenshtein.c"
#define LEVENSHTEIN_LESS_EQUAL
#include "levenshtein.c"
PG_FUNCTION_INFO_V1(levenshtein_with_costs);
Datum
levenshtein_with_costs(PG_FUNCTION_ARGS)
......@@ -180,8 +163,20 @@ levenshtein_with_costs(PG_FUNCTION_ARGS)
int ins_c = PG_GETARG_INT32(2);
int del_c = PG_GETARG_INT32(3);
int sub_c = PG_GETARG_INT32(4);
PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
const char *s_data;
const char *t_data;
int s_bytes,
t_bytes;
/* Extract a pointer to the actual character data */
s_data = VARDATA_ANY(src);
t_data = VARDATA_ANY(dst);
/* Determine length of each string in bytes */
s_bytes = VARSIZE_ANY_EXHDR(src);
t_bytes = VARSIZE_ANY_EXHDR(dst);
PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c,
del_c, sub_c));
}
......@@ -191,8 +186,20 @@ levenshtein(PG_FUNCTION_ARGS)
{
text *src = PG_GETARG_TEXT_PP(0);
text *dst = PG_GETARG_TEXT_PP(1);
PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
const char *s_data;
const char *t_data;
int s_bytes,
t_bytes;
/* Extract a pointer to the actual character data */
s_data = VARDATA_ANY(src);
t_data = VARDATA_ANY(dst);
/* Determine length of each string in bytes */
s_bytes = VARSIZE_ANY_EXHDR(src);
t_bytes = VARSIZE_ANY_EXHDR(dst);
PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1,
1));
}
......@@ -206,8 +213,21 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
int del_c = PG_GETARG_INT32(3);
int sub_c = PG_GETARG_INT32(4);
int max_d = PG_GETARG_INT32(5);
PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d));
const char *s_data;
const char *t_data;
int s_bytes,
t_bytes;
/* Extract a pointer to the actual character data */
s_data = VARDATA_ANY(src);
t_data = VARDATA_ANY(dst);
/* Determine length of each string in bytes */
s_bytes = VARSIZE_ANY_EXHDR(src);
t_bytes = VARSIZE_ANY_EXHDR(dst);
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
t_bytes, ins_c, del_c,
sub_c, max_d));
}
......@@ -218,8 +238,20 @@ levenshtein_less_equal(PG_FUNCTION_ARGS)
text *src = PG_GETARG_TEXT_PP(0);
text *dst = PG_GETARG_TEXT_PP(1);
int max_d = PG_GETARG_INT32(2);
PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d));
const char *s_data;
const char *t_data;
int s_bytes,
t_bytes;
/* Extract a pointer to the actual character data */
s_data = VARDATA_ANY(src);
t_data = VARDATA_ANY(dst);
/* Determine length of each string in bytes */
s_bytes = VARSIZE_ANY_EXHDR(src);
t_bytes = VARSIZE_ANY_EXHDR(dst);
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
t_bytes, 1, 1, 1, max_d));
}
......
......@@ -38,4 +38,6 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
like.o: like.c like_match.c
varlena.o: varlena.c levenshtein.c
include $(top_srcdir)/src/backend/common.mk
/*
/*-------------------------------------------------------------------------
*
* levenshtein.c
* Levenshtein distance implementation.
*
* Functions for "fuzzy" comparison of strings
* Original author: Joe Conway <mail@joeconway.com>
*
* Joe Conway <mail@joeconway.com>
* This file is included by varlena.c twice, to provide matching code for (1)
* Levenshtein distance with custom costings, and (2) Levenshtein distance with
* custom costings and a "max" value above which exact distances are not
* interesting. Before the inclusion, we rely on the presence of the inline
* function rest_of_char_same().
*
* Written based on a description of the algorithm by Michael Gilleland found
* at http://www.merriampark.com/ld.htm. Also looked at levenshtein.c in the
* PHP 4.0.6 distribution for inspiration. Configurable penalty costs
* extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com>.
*
* Copyright (c) 2001-2014, PostgreSQL Global Development Group
* ALL RIGHTS RESERVED;
*
* levenshtein()
* -------------
* Written based on a description of the algorithm by Michael Gilleland
* found at http://www.merriampark.com/ld.htm
* Also looked at levenshtein.c in the PHP 4.0.6 distribution for
* inspiration.
* Configurable penalty costs extension is introduced by Volkan
* YAZICI <volkan.yazici@gmail.com>.
*/
/*
* External declarations for exported functions
* IDENTIFICATION
* src/backend/utils/adt/levenshtein.c
*
*-------------------------------------------------------------------------
*/
#ifdef LEVENSHTEIN_LESS_EQUAL
static int levenshtein_less_equal_internal(text *s, text *t,
int ins_c, int del_c, int sub_c, int max_d);
#else
static int levenshtein_internal(text *s, text *t,
int ins_c, int del_c, int sub_c);
#endif
#define MAX_LEVENSHTEIN_STRLEN 255
/*
* Calculates Levenshtein distance metric between supplied strings. Generally
* (1, 1, 1) penalty costs suffices for common cases, but your mileage may
* vary.
* Calculates Levenshtein distance metric between supplied strings, which are
* not necessarily null-terminated. Generally (1, 1, 1) penalty costs suffices
* for common cases, but your mileage may vary.
*
* One way to compute Levenshtein distance is to incrementally construct
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
......@@ -63,30 +56,27 @@ static int levenshtein_internal(text *s, text *t,
* identify the portion of the matrix close to the diagonal which can still
* affect the final answer.
*/
static int
int
#ifdef LEVENSHTEIN_LESS_EQUAL
levenshtein_less_equal_internal(text *s, text *t,
int ins_c, int del_c, int sub_c, int max_d)
varstr_levenshtein_less_equal(const char *source, int slen, const char *target,
int tlen, int ins_c, int del_c, int sub_c,
int max_d)
#else
levenshtein_internal(text *s, text *t,
int ins_c, int del_c, int sub_c)
varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
int ins_c, int del_c, int sub_c)
#endif
{
int m,
n,
s_bytes,
t_bytes;
n;
int *prev;
int *curr;
int *s_char_len = NULL;
int i,
j;
const char *s_data;
const char *t_data;
const char *y;
/*
* For levenshtein_less_equal_internal, we have real variables called
* For varstr_levenshtein_less_equal, we have real variables called
* start_column and stop_column; otherwise it's just short-hand for 0 and
* m.
*/
......@@ -105,15 +95,8 @@ levenshtein_internal(text *s, text *t,
#define STOP_COLUMN m
#endif
/* Extract a pointer to the actual character data. */
s_data = VARDATA_ANY(s);
t_data = VARDATA_ANY(t);
/* Determine length of each string in bytes and characters. */
s_bytes = VARSIZE_ANY_EXHDR(s);
t_bytes = VARSIZE_ANY_EXHDR(t);
m = pg_mbstrlen_with_len(s_data, s_bytes);
n = pg_mbstrlen_with_len(t_data, t_bytes);
m = pg_mbstrlen_with_len(source, slen);
n = pg_mbstrlen_with_len(target, tlen);
/*
* We can transform an empty s into t with n insertions, or a non-empty t
......@@ -193,10 +176,10 @@ levenshtein_internal(text *s, text *t,
* multi-byte characters, we still build the array, so that the fast-path
* needn't deal with the case where the array hasn't been initialized.
*/
if (m != s_bytes || n != t_bytes)
if (m != slen || n != tlen)
{
int i;
const char *cp = s_data;
const char *cp = source;
s_char_len = (int *) palloc((m + 1) * sizeof(int));
for (i = 0; i < m; ++i)
......@@ -223,11 +206,11 @@ levenshtein_internal(text *s, text *t,
prev[i] = i * del_c;
/* Loop through rows of the notional array */
for (y = t_data, j = 1; j < n; j++)
for (y = target, j = 1; j < n; j++)
{
int *temp;
const char *x = s_data;
int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;
const char *x = source;
int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
#ifdef LEVENSHTEIN_LESS_EQUAL
......@@ -384,7 +367,7 @@ levenshtein_internal(text *s, text *t,
prev[start_column] = max_d + 1;
curr[start_column] = max_d + 1;
if (start_column != 0)
s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
start_column++;
}
......
......@@ -1546,7 +1546,6 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
return result;
}
/* text_cmp()
* Internal comparison function for text strings.
* Returns -1, 0 or 1
......@@ -4747,3 +4746,24 @@ text_format_nv(PG_FUNCTION_ARGS)
{
return text_format(fcinfo);
}
/*
 * Byte-wise equality check used by the Levenshtein distance code.
 *
 * Returns true when the first "len" bytes of s1 and s2 match, scanning from
 * the final byte towards the first; a non-positive "len" yields true.
 * Faster than memcmp(), for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining = len;

	/* Walk backwards while the trailing bytes still agree. */
	while (remaining > 0 && s1[remaining - 1] == s2[remaining - 1])
		remaining--;

	/* Anything left unchecked means we stopped on a mismatch. */
	return remaining <= 0;
}
/* Expand each Levenshtein distance variant */
#include "levenshtein.c"
#define LEVENSHTEIN_LESS_EQUAL
#include "levenshtein.c"
......@@ -786,6 +786,11 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS);
extern Datum name_text(PG_FUNCTION_ARGS);
extern Datum text_name(PG_FUNCTION_ARGS);
extern int varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid);
/*
 * Levenshtein distance between "source" and "target".  Each string is passed
 * as a pointer plus its length in bytes (slen, tlen) and need not be
 * null-terminated.  ins_c, del_c and sub_c are the penalty costs; callers
 * that want the plain metric pass (1, 1, 1).
 */
extern int varstr_levenshtein(const char *source, int slen, const char *target,
int tlen, int ins_c, int del_c, int sub_c);
/*
 * Same as varstr_levenshtein(), with an extra "max_d" argument: per the
 * levenshtein.c header, a "max" value above which exact distances are not
 * interesting.
 */
extern int varstr_levenshtein_less_equal(const char *source, int slen,
const char *target, int tlen, int ins_c,
int del_c, int sub_c, int max_d);
extern List *textToQualifiedNameList(text *textval);
extern bool SplitIdentifierString(char *rawstring, char separator,
List **namelist);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment