Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
Postgres FD Implementation
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Abuhujair Javed
Postgres FD Implementation
Commits
604ab081
Commit
604ab081
authored
Oct 19, 2010
by
Robert Haas
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add levenshtein_less_equal, optimized version for small distances.
Alexander Korotkov, heavily revised by me.
parent
262c1a42
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
459 additions
and
213 deletions
+459
-213
contrib/fuzzystrmatch/fuzzystrmatch.c
contrib/fuzzystrmatch/fuzzystrmatch.c
+31
-213
contrib/fuzzystrmatch/fuzzystrmatch.sql.in
contrib/fuzzystrmatch/fuzzystrmatch.sql.in
+8
-0
contrib/fuzzystrmatch/levenshtein.c
contrib/fuzzystrmatch/levenshtein.c
+397
-0
contrib/fuzzystrmatch/uninstall_fuzzystrmatch.sql
contrib/fuzzystrmatch/uninstall_fuzzystrmatch.sql
+4
-0
doc/src/sgml/fuzzystrmatch.sgml
doc/src/sgml/fuzzystrmatch.sgml
+19
-0
No files found.
contrib/fuzzystrmatch/fuzzystrmatch.c
View file @
604ab081
...
@@ -9,15 +9,6 @@
...
@@ -9,15 +9,6 @@
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
* ALL RIGHTS RESERVED;
* ALL RIGHTS RESERVED;
*
*
* levenshtein()
* -------------
* Written based on a description of the algorithm by Michael Gilleland
* found at http://www.merriampark.com/ld.htm
* Also looked at levenshtein.c in the PHP 4.0.6 distribution for
* inspiration.
* Configurable penalty costs extension is introduced by Volkan
* YAZICI <volkan.yazici@gmail.com>.
*
* metaphone()
* metaphone()
* -----------
* -----------
* Modified for PostgreSQL by Joe Conway.
* Modified for PostgreSQL by Joe Conway.
...
@@ -61,6 +52,8 @@ PG_MODULE_MAGIC;
...
@@ -61,6 +52,8 @@ PG_MODULE_MAGIC;
*/
*/
extern
Datum
levenshtein_with_costs
(
PG_FUNCTION_ARGS
);
extern
Datum
levenshtein_with_costs
(
PG_FUNCTION_ARGS
);
extern
Datum
levenshtein
(
PG_FUNCTION_ARGS
);
extern
Datum
levenshtein
(
PG_FUNCTION_ARGS
);
extern
Datum
levenshtein_less_equal_with_costs
(
PG_FUNCTION_ARGS
);
extern
Datum
levenshtein_less_equal
(
PG_FUNCTION_ARGS
);
extern
Datum
metaphone
(
PG_FUNCTION_ARGS
);
extern
Datum
metaphone
(
PG_FUNCTION_ARGS
);
extern
Datum
soundex
(
PG_FUNCTION_ARGS
);
extern
Datum
soundex
(
PG_FUNCTION_ARGS
);
extern
Datum
difference
(
PG_FUNCTION_ARGS
);
extern
Datum
difference
(
PG_FUNCTION_ARGS
);
...
@@ -85,16 +78,6 @@ soundex_code(char letter)
...
@@ -85,16 +78,6 @@ soundex_code(char letter)
return
letter
;
return
letter
;
}
}
/*
* Levenshtein
*/
#define MAX_LEVENSHTEIN_STRLEN 255
static
int
levenshtein_internal
(
text
*
s
,
text
*
t
,
int
ins_c
,
int
del_c
,
int
sub_c
);
/*
/*
* Metaphone
* Metaphone
*/
*/
...
@@ -197,224 +180,59 @@ rest_of_char_same(const char *s1, const char *s2, int len)
...
@@ -197,224 +180,59 @@ rest_of_char_same(const char *s1, const char *s2, int len)
return
true
;
return
true
;
}
}
/*
#include "levenshtein.c"
* levenshtein_internal - Calculates Levenshtein distance metric
#define LEVENSHTEIN_LESS_EQUAL
* between supplied strings. Generally
#include "levenshtein.c"
* (1, 1, 1) penalty costs suffices common
* cases, but your mileage may vary.
*/
static
int
levenshtein_internal
(
text
*
s
,
text
*
t
,
int
ins_c
,
int
del_c
,
int
sub_c
)
{
int
m
,
n
,
s_bytes
,
t_bytes
;
int
*
prev
;
int
*
curr
;
int
*
s_char_len
=
NULL
;
int
i
,
j
;
const
char
*
s_data
;
const
char
*
t_data
;
const
char
*
y
;
/* Extract a pointer to the actual character data. */
s_data
=
VARDATA_ANY
(
s
);
t_data
=
VARDATA_ANY
(
t
);
/* Determine length of each string in bytes and characters. */
s_bytes
=
VARSIZE_ANY_EXHDR
(
s
);
t_bytes
=
VARSIZE_ANY_EXHDR
(
t
);
m
=
pg_mbstrlen_with_len
(
s_data
,
s_bytes
);
n
=
pg_mbstrlen_with_len
(
t_data
,
t_bytes
);
/*
* We can transform an empty s into t with n insertions, or a non-empty t
* into an empty s with m deletions.
*/
if
(
!
m
)
return
n
*
ins_c
;
if
(
!
n
)
return
m
*
del_c
;
/*
* For security concerns, restrict excessive CPU+RAM usage. (This
* implementation uses O(m) memory and has O(mn) complexity.)
*/
if
(
m
>
MAX_LEVENSHTEIN_STRLEN
||
n
>
MAX_LEVENSHTEIN_STRLEN
)
ereport
(
ERROR
,
(
errcode
(
ERRCODE_INVALID_PARAMETER_VALUE
),
errmsg
(
"argument exceeds the maximum length of %d bytes"
,
MAX_LEVENSHTEIN_STRLEN
)));
/*
* In order to avoid calling pg_mblen() repeatedly on each character in s,
* we cache all the lengths before starting the main loop -- but if all the
* characters in both strings are single byte, then we skip this and use
* a fast-path in the main loop. If only one string contains multi-byte
* characters, we still build the array, so that the fast-path needn't
* deal with the case where the array hasn't been initialized.
*/
if
(
m
!=
s_bytes
||
n
!=
t_bytes
)
{
int
i
;
const
char
*
cp
=
s_data
;
s_char_len
=
(
int
*
)
palloc
((
m
+
1
)
*
sizeof
(
int
));
for
(
i
=
0
;
i
<
m
;
++
i
)
{
s_char_len
[
i
]
=
pg_mblen
(
cp
);
cp
+=
s_char_len
[
i
];
}
s_char_len
[
i
]
=
0
;
}
/* One more cell for initialization column and row. */
++
m
;
++
n
;
/*
PG_FUNCTION_INFO_V1
(
levenshtein_with_costs
);
* One way to compute Levenshtein distance is to incrementally construct
Datum
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
levenshtein_with_costs
(
PG_FUNCTION_ARGS
)
* of operations required to transform the first i characters of s into
{
* the first j characters of t. The last column of the final row is the
text
*
src
=
PG_GETARG_TEXT_PP
(
0
);
* answer.
text
*
dst
=
PG_GETARG_TEXT_PP
(
1
);
*
int
ins_c
=
PG_GETARG_INT32
(
2
);
* We use that algorithm here with some modification. In lieu of holding
int
del_c
=
PG_GETARG_INT32
(
3
);
* the entire array in memory at once, we'll just use two arrays of size
int
sub_c
=
PG_GETARG_INT32
(
4
);
* m+1 for storing accumulated values. At each step one array represents
* the "previous" row and one is the "current" row of the notional large
* array.
*/
prev
=
(
int
*
)
palloc
(
2
*
m
*
sizeof
(
int
));
curr
=
prev
+
m
;
/*
* To transform the first i characters of s into the first 0 characters
* of t, we must perform i deletions.
*/
for
(
i
=
0
;
i
<
m
;
i
++
)
prev
[
i
]
=
i
*
del_c
;
/* Loop through rows of the notional array */
for
(
y
=
t_data
,
j
=
1
;
j
<
n
;
j
++
)
{
int
*
temp
;
const
char
*
x
=
s_data
;
int
y_char_len
=
n
!=
t_bytes
+
1
?
pg_mblen
(
y
)
:
1
;
/*
* To transform the first 0 characters of s into the first j
* characters of t, we must perform j insertions.
*/
curr
[
0
]
=
j
*
ins_c
;
/*
* This inner loop is critical to performance, so we include a
* fast-path to handle the (fairly common) case where no multibyte
* characters are in the mix. The fast-path is entitled to assume
* that if s_char_len is not initialized then BOTH strings contain
* only single-byte characters.
*/
if
(
s_char_len
!=
NULL
)
{
for
(
i
=
1
;
i
<
m
;
i
++
)
{
int
ins
;
int
del
;
int
sub
;
int
x_char_len
=
s_char_len
[
i
-
1
];
/*
* Calculate costs for insertion, deletion, and substitution.
*
* When calculating cost for substitution, we compare the last
* character of each possibly-multibyte character first,
* because that's enough to rule out most mis-matches. If we
* get past that test, then we compare the lengths and the
* remaining bytes.
*/
ins
=
prev
[
i
]
+
ins_c
;
del
=
curr
[
i
-
1
]
+
del_c
;
if
(
x
[
x_char_len
-
1
]
==
y
[
y_char_len
-
1
]
&&
x_char_len
==
y_char_len
&&
(
x_char_len
==
1
||
rest_of_char_same
(
x
,
y
,
x_char_len
)))
sub
=
prev
[
i
-
1
];
else
sub
=
prev
[
i
-
1
]
+
sub_c
;
/* Take the one with minimum cost. */
curr
[
i
]
=
Min
(
ins
,
del
);
curr
[
i
]
=
Min
(
curr
[
i
],
sub
);
/* Point to next character. */
x
+=
x_char_len
;
}
}
else
{
for
(
i
=
1
;
i
<
m
;
i
++
)
{
int
ins
;
int
del
;
int
sub
;
/* Calculate costs for insertion, deletion, and substitution. */
ins
=
prev
[
i
]
+
ins_c
;
del
=
curr
[
i
-
1
]
+
del_c
;
sub
=
prev
[
i
-
1
]
+
((
*
x
==
*
y
)
?
0
:
sub_c
);
/* Take the one with minimum cost. */
curr
[
i
]
=
Min
(
ins
,
del
);
curr
[
i
]
=
Min
(
curr
[
i
],
sub
);
/* Point to next character. */
PG_RETURN_INT32
(
levenshtein_internal
(
src
,
dst
,
ins_c
,
del_c
,
sub_c
));
x
++
;
}
}
}
/* Swap current row with previous row. */
temp
=
curr
;
curr
=
prev
;
prev
=
temp
;
/* Point to next character. */
PG_FUNCTION_INFO_V1
(
levenshtein
);
y
+=
y_char_len
;
Datum
}
levenshtein
(
PG_FUNCTION_ARGS
)
{
text
*
src
=
PG_GETARG_TEXT_PP
(
0
);
text
*
dst
=
PG_GETARG_TEXT_PP
(
1
);
/*
PG_RETURN_INT32
(
levenshtein_internal
(
src
,
dst
,
1
,
1
,
1
));
* Because the final value was swapped from the previous row to the
* current row, that's where we'll find it.
*/
return
prev
[
m
-
1
];
}
}
PG_FUNCTION_INFO_V1
(
levenshtein_with_costs
);
PG_FUNCTION_INFO_V1
(
levenshtein_
less_equal_
with_costs
);
Datum
Datum
levenshtein_with_costs
(
PG_FUNCTION_ARGS
)
levenshtein_
less_equal_
with_costs
(
PG_FUNCTION_ARGS
)
{
{
text
*
src
=
PG_GETARG_TEXT_PP
(
0
);
text
*
src
=
PG_GETARG_TEXT_PP
(
0
);
text
*
dst
=
PG_GETARG_TEXT_PP
(
1
);
text
*
dst
=
PG_GETARG_TEXT_PP
(
1
);
int
ins_c
=
PG_GETARG_INT32
(
2
);
int
ins_c
=
PG_GETARG_INT32
(
2
);
int
del_c
=
PG_GETARG_INT32
(
3
);
int
del_c
=
PG_GETARG_INT32
(
3
);
int
sub_c
=
PG_GETARG_INT32
(
4
);
int
sub_c
=
PG_GETARG_INT32
(
4
);
int
max_d
=
PG_GETARG_INT32
(
5
);
PG_RETURN_INT32
(
levenshtein_
internal
(
src
,
dst
,
ins_c
,
del_c
,
sub_c
));
PG_RETURN_INT32
(
levenshtein_
less_equal_internal
(
src
,
dst
,
ins_c
,
del_c
,
sub_c
,
max_d
));
}
}
PG_FUNCTION_INFO_V1
(
levenshtein
);
PG_FUNCTION_INFO_V1
(
levenshtein
_less_equal
);
Datum
Datum
levenshtein
(
PG_FUNCTION_ARGS
)
levenshtein
_less_equal
(
PG_FUNCTION_ARGS
)
{
{
text
*
src
=
PG_GETARG_TEXT_PP
(
0
);
text
*
src
=
PG_GETARG_TEXT_PP
(
0
);
text
*
dst
=
PG_GETARG_TEXT_PP
(
1
);
text
*
dst
=
PG_GETARG_TEXT_PP
(
1
);
int
max_d
=
PG_GETARG_INT32
(
2
);
PG_RETURN_INT32
(
levenshtein_
internal
(
src
,
dst
,
1
,
1
,
1
));
PG_RETURN_INT32
(
levenshtein_
less_equal_internal
(
src
,
dst
,
1
,
1
,
1
,
max_d
));
}
}
...
...
contrib/fuzzystrmatch/fuzzystrmatch.sql.in
View file @
604ab081
...
@@ -11,6 +11,14 @@ CREATE OR REPLACE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
...
@@ -11,6 +11,14 @@ CREATE OR REPLACE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
AS 'MODULE_PATHNAME','levenshtein_with_costs'
AS 'MODULE_PATHNAME','levenshtein_with_costs'
LANGUAGE C IMMUTABLE STRICT;
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int
AS 'MODULE_PATHNAME','levenshtein_less_equal'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int
AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION metaphone (text,int) RETURNS text
CREATE OR REPLACE FUNCTION metaphone (text,int) RETURNS text
AS 'MODULE_PATHNAME','metaphone'
AS 'MODULE_PATHNAME','metaphone'
LANGUAGE C IMMUTABLE STRICT;
LANGUAGE C IMMUTABLE STRICT;
...
...
contrib/fuzzystrmatch/levenshtein.c
0 → 100644
View file @
604ab081
/*
* levenshtein.c
*
* Functions for "fuzzy" comparison of strings
*
* Joe Conway <mail@joeconway.com>
*
* contrib/fuzzystrmatch/fuzzystrmatch.c
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
* ALL RIGHTS RESERVED;
*
* levenshtein()
* -------------
* Written based on a description of the algorithm by Michael Gilleland
* found at http://www.merriampark.com/ld.htm
* Also looked at levenshtein.c in the PHP 4.0.6 distribution for
* inspiration.
* Configurable penalty costs extension is introduced by Volkan
* YAZICI <volkan.yazici@gmail.com>.
*/
/*
* External declarations for exported functions
*/
#ifdef LEVENSHTEIN_LESS_EQUAL
static
int
levenshtein_less_equal_internal
(
text
*
s
,
text
*
t
,
int
ins_c
,
int
del_c
,
int
sub_c
,
int
max_d
);
#else
static
int
levenshtein_internal
(
text
*
s
,
text
*
t
,
int
ins_c
,
int
del_c
,
int
sub_c
);
#endif
#define MAX_LEVENSHTEIN_STRLEN 255
/*
* Calculates Levenshtein distance metric between supplied strings. Generally
* (1, 1, 1) penalty costs suffices for common cases, but your mileage may
* vary.
*
* One way to compute Levenshtein distance is to incrementally construct
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
* of operations required to transform the first i characters of s into
* the first j characters of t. The last column of the final row is the
* answer.
*
* We use that algorithm here with some modification. In lieu of holding
* the entire array in memory at once, we'll just use two arrays of size
* m+1 for storing accumulated values. At each step one array represents
* the "previous" row and one is the "current" row of the notional large
* array.
*
* If max_d >= 0, we only need to provide an accurate answer when that answer
* is less than or equal to the bound. From any cell in the matrix, there is
* theoretical "minimum residual distance" from that cell to the last column
* of the final row. This minimum residual distance is zero when the
* untransformed portions of the strings are of equal length (because we might
* get lucky and find all the remaining characters matching) and is otherwise
* based on the minimum number of insertions or deletions needed to make them
* equal length. The residual distance grows as we move toward the upper
* right or lower left corners of the matrix. When the max_d bound is
* usefully tight, we can use this property to avoid computing the entirety
* of each row; instead, we maintain a start_column and stop_column that
* identify the portion of the matrix close to the diagonal which can still
* affect the final answer.
*/
static
int
#ifdef LEVENSHTEIN_LESS_EQUAL
levenshtein_less_equal_internal
(
text
*
s
,
text
*
t
,
int
ins_c
,
int
del_c
,
int
sub_c
,
int
max_d
)
#else
levenshtein_internal
(
text
*
s
,
text
*
t
,
int
ins_c
,
int
del_c
,
int
sub_c
)
#endif
{
int
m
,
n
,
s_bytes
,
t_bytes
;
int
*
prev
;
int
*
curr
;
int
*
s_char_len
=
NULL
;
int
i
,
j
;
const
char
*
s_data
;
const
char
*
t_data
;
const
char
*
y
;
/*
* For levenshtein_less_equal_internal, we have real variables called
* start_column and stop_column; otherwise it's just short-hand for 0
* and m.
*/
#ifdef LEVENSHTEIN_LESS_EQUAL
int
start_column
,
stop_column
;
#undef START_COLUMN
#undef STOP_COLUMN
#define START_COLUMN start_column
#define STOP_COLUMN stop_column
#else
#undef START_COLUMN
#undef STOP_COLUMN
#define START_COLUMN 0
#define STOP_COLUMN m
#endif
/* Extract a pointer to the actual character data. */
s_data
=
VARDATA_ANY
(
s
);
t_data
=
VARDATA_ANY
(
t
);
/* Determine length of each string in bytes and characters. */
s_bytes
=
VARSIZE_ANY_EXHDR
(
s
);
t_bytes
=
VARSIZE_ANY_EXHDR
(
t
);
m
=
pg_mbstrlen_with_len
(
s_data
,
s_bytes
);
n
=
pg_mbstrlen_with_len
(
t_data
,
t_bytes
);
/*
* We can transform an empty s into t with n insertions, or a non-empty t
* into an empty s with m deletions.
*/
if
(
!
m
)
return
n
*
ins_c
;
if
(
!
n
)
return
m
*
del_c
;
/*
* For security concerns, restrict excessive CPU+RAM usage. (This
* implementation uses O(m) memory and has O(mn) complexity.)
*/
if
(
m
>
MAX_LEVENSHTEIN_STRLEN
||
n
>
MAX_LEVENSHTEIN_STRLEN
)
ereport
(
ERROR
,
(
errcode
(
ERRCODE_INVALID_PARAMETER_VALUE
),
errmsg
(
"argument exceeds the maximum length of %d bytes"
,
MAX_LEVENSHTEIN_STRLEN
)));
#ifdef LEVENSHTEIN_LESS_EQUAL
/* Initialize start and stop columns. */
start_column
=
0
;
stop_column
=
m
+
1
;
/*
* If max_d >= 0, determine whether the bound is impossibly tight. If so,
* return max_d + 1 immediately. Otherwise, determine whether it's tight
* enough to limit the computation we must perform. If so, figure out
* initial stop column.
*/
if
(
max_d
>=
0
)
{
int
min_theo_d
;
/* Theoretical minimum distance. */
int
max_theo_d
;
/* Theoretical maximum distance. */
int
net_inserts
=
n
-
m
;
min_theo_d
=
net_inserts
<
0
?
-
net_inserts
*
del_c
:
net_inserts
*
ins_c
;
if
(
min_theo_d
>
max_d
)
return
max_d
+
1
;
if
(
ins_c
+
del_c
<
sub_c
)
sub_c
=
ins_c
+
del_c
;
max_theo_d
=
min_theo_d
+
sub_c
*
Min
(
m
,
n
);
if
(
max_d
>=
max_theo_d
)
max_d
=
-
1
;
else
if
(
ins_c
+
del_c
>
0
)
{
/*
* Figure out how much of the first row of the notional matrix
* we need to fill in. If the string is growing, the theoretical
* minimum distance already incorporates the cost of deleting the
* number of characters necessary to make the two strings equal
* in length. Each additional deletion forces another insertion,
* so the best-case total cost increases by ins_c + del_c.
* If the string is shrinking, the minimum theoretical cost
* assumes no excess deletions; that is, we're starting no futher
* right than column n - m. If we do start further right, the
* best-case total cost increases by ins_c + del_c for each move
* right.
*/
int
slack_d
=
max_d
-
min_theo_d
;
int
best_column
=
net_inserts
<
0
?
-
net_inserts
:
0
;
stop_column
=
best_column
+
(
slack_d
/
(
ins_c
+
del_c
))
+
1
;
if
(
stop_column
>
m
)
stop_column
=
m
+
1
;
}
}
#endif
/*
* In order to avoid calling pg_mblen() repeatedly on each character in s,
* we cache all the lengths before starting the main loop -- but if all the
* characters in both strings are single byte, then we skip this and use
* a fast-path in the main loop. If only one string contains multi-byte
* characters, we still build the array, so that the fast-path needn't
* deal with the case where the array hasn't been initialized.
*/
if
(
m
!=
s_bytes
||
n
!=
t_bytes
)
{
int
i
;
const
char
*
cp
=
s_data
;
s_char_len
=
(
int
*
)
palloc
((
m
+
1
)
*
sizeof
(
int
));
for
(
i
=
0
;
i
<
m
;
++
i
)
{
s_char_len
[
i
]
=
pg_mblen
(
cp
);
cp
+=
s_char_len
[
i
];
}
s_char_len
[
i
]
=
0
;
}
/* One more cell for initialization column and row. */
++
m
;
++
n
;
/* Previous and current rows of notional array. */
prev
=
(
int
*
)
palloc
(
2
*
m
*
sizeof
(
int
));
curr
=
prev
+
m
;
/*
* To transform the first i characters of s into the first 0 characters
* of t, we must perform i deletions.
*/
for
(
i
=
START_COLUMN
;
i
<
STOP_COLUMN
;
i
++
)
prev
[
i
]
=
i
*
del_c
;
/* Loop through rows of the notional array */
for
(
y
=
t_data
,
j
=
1
;
j
<
n
;
j
++
)
{
int
*
temp
;
const
char
*
x
=
s_data
;
int
y_char_len
=
n
!=
t_bytes
+
1
?
pg_mblen
(
y
)
:
1
;
#ifdef LEVENSHTEIN_LESS_EQUAL
/*
* In the best case, values percolate down the diagonal unchanged, so
* we must increment stop_column unless it's already on the right end
* of the array. The inner loop will read prev[stop_column], so we
* have to initialize it even though it shouldn't affect the result.
*/
if
(
stop_column
<
m
)
{
prev
[
stop_column
]
=
max_d
+
1
;
++
stop_column
;
}
/*
* The main loop fills in curr, but curr[0] needs a special case:
* to transform the first 0 characters of s into the first j
* characters of t, we must perform j insertions. However, if
* start_column > 0, this special case does not apply.
*/
if
(
start_column
==
0
)
{
curr
[
0
]
=
j
*
ins_c
;
i
=
1
;
}
else
i
=
start_column
;
#else
curr
[
0
]
=
j
*
ins_c
;
i
=
1
;
#endif
/*
* This inner loop is critical to performance, so we include a
* fast-path to handle the (fairly common) case where no multibyte
* characters are in the mix. The fast-path is entitled to assume
* that if s_char_len is not initialized then BOTH strings contain
* only single-byte characters.
*/
if
(
s_char_len
!=
NULL
)
{
for
(;
i
<
STOP_COLUMN
;
i
++
)
{
int
ins
;
int
del
;
int
sub
;
int
x_char_len
=
s_char_len
[
i
-
1
];
/*
* Calculate costs for insertion, deletion, and substitution.
*
* When calculating cost for substitution, we compare the last
* character of each possibly-multibyte character first,
* because that's enough to rule out most mis-matches. If we
* get past that test, then we compare the lengths and the
* remaining bytes.
*/
ins
=
prev
[
i
]
+
ins_c
;
del
=
curr
[
i
-
1
]
+
del_c
;
if
(
x
[
x_char_len
-
1
]
==
y
[
y_char_len
-
1
]
&&
x_char_len
==
y_char_len
&&
(
x_char_len
==
1
||
rest_of_char_same
(
x
,
y
,
x_char_len
)))
sub
=
prev
[
i
-
1
];
else
sub
=
prev
[
i
-
1
]
+
sub_c
;
/* Take the one with minimum cost. */
curr
[
i
]
=
Min
(
ins
,
del
);
curr
[
i
]
=
Min
(
curr
[
i
],
sub
);
/* Point to next character. */
x
+=
x_char_len
;
}
}
else
{
for
(;
i
<
STOP_COLUMN
;
i
++
)
{
int
ins
;
int
del
;
int
sub
;
/* Calculate costs for insertion, deletion, and substitution. */
ins
=
prev
[
i
]
+
ins_c
;
del
=
curr
[
i
-
1
]
+
del_c
;
sub
=
prev
[
i
-
1
]
+
((
*
x
==
*
y
)
?
0
:
sub_c
);
/* Take the one with minimum cost. */
curr
[
i
]
=
Min
(
ins
,
del
);
curr
[
i
]
=
Min
(
curr
[
i
],
sub
);
/* Point to next character. */
x
++
;
}
}
/* Swap current row with previous row. */
temp
=
curr
;
curr
=
prev
;
prev
=
temp
;
/* Point to next character. */
y
+=
y_char_len
;
#ifdef LEVENSHTEIN_LESS_EQUAL
/*
* This chunk of code represents a significant performance hit if used
* in the case where there is no max_d bound. This is probably not
* because the max_d >= 0 test itself is expensive, but rather because
* the possibility of needing to execute this code prevents tight
* optimization of the loop as a whole.
*/
if
(
max_d
>=
0
)
{
/*
* The "zero point" is the column of the current row where the
* remaining portions of the strings are of equal length. There
* are (n - 1) characters in the target string, of which j have
* been transformed. There are (m - 1) characters in the source
* string, so we want to find the value for zp where where (n - 1)
* - j = (m - 1) - zp.
*/
int
zp
=
j
-
(
n
-
m
);
/* Check whether the stop column can slide left. */
while
(
stop_column
>
0
)
{
int
ii
=
stop_column
-
1
;
int
net_inserts
=
ii
-
zp
;
if
(
prev
[
ii
]
+
(
net_inserts
>
0
?
net_inserts
*
ins_c
:
-
net_inserts
*
del_c
)
<=
max_d
)
break
;
stop_column
--
;
}
/* Check whether the start column can slide right. */
while
(
start_column
<
stop_column
)
{
int
net_inserts
=
start_column
-
zp
;
if
(
prev
[
start_column
]
+
(
net_inserts
>
0
?
net_inserts
*
ins_c
:
-
net_inserts
*
del_c
)
<=
max_d
)
break
;
/*
* We'll never again update these values, so we must make
* sure there's nothing here that could confuse any future
* iteration of the outer loop.
*/
prev
[
start_column
]
=
max_d
+
1
;
curr
[
start_column
]
=
max_d
+
1
;
if
(
start_column
!=
0
)
s_data
+=
n
!=
t_bytes
+
1
?
pg_mblen
(
s_data
)
:
1
;
start_column
++
;
}
/* If they cross, we're going to exceed the bound. */
if
(
start_column
>=
stop_column
)
return
max_d
+
1
;
}
#endif
}
/*
* Because the final value was swapped from the previous row to the
* current row, that's where we'll find it.
*/
return
prev
[
m
-
1
];
}
contrib/fuzzystrmatch/uninstall_fuzzystrmatch.sql
View file @
604ab081
...
@@ -18,3 +18,7 @@ DROP FUNCTION metaphone (text,int);
...
@@ -18,3 +18,7 @@ DROP FUNCTION metaphone (text,int);
DROP
FUNCTION
levenshtein
(
text
,
text
,
int
,
int
,
int
);
DROP
FUNCTION
levenshtein
(
text
,
text
,
int
,
int
,
int
);
DROP
FUNCTION
levenshtein
(
text
,
text
);
DROP
FUNCTION
levenshtein
(
text
,
text
);
DROP
FUNCTION
levenshtein_less_equal
(
text
,
text
,
int
);
DROP
FUNCTION
levenshtein_less_equal
(
text
,
text
,
int
,
int
,
int
,
int
);
doc/src/sgml/fuzzystrmatch.sgml
View file @
604ab081
...
@@ -84,6 +84,8 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2;
...
@@ -84,6 +84,8 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2;
<synopsis>
<synopsis>
levenshtein(text source, text target, int ins_cost, int del_cost, int sub_cost) returns int
levenshtein(text source, text target, int ins_cost, int del_cost, int sub_cost) returns int
levenshtein(text source, text target) returns int
levenshtein(text source, text target) returns int
levenshtein_less_equal(text source, text target, int ins_cost, int del_cost, int sub_cost, int max_d) returns int
levenshtein_less_equal(text source, text target, int max_d) returns int
</synopsis>
</synopsis>
<para>
<para>
...
@@ -92,6 +94,11 @@ levenshtein(text source, text target) returns int
...
@@ -92,6 +94,11 @@ levenshtein(text source, text target) returns int
specify how much to charge for a character insertion, deletion, or
specify how much to charge for a character insertion, deletion, or
substitution, respectively. You can omit the cost parameters, as in
substitution, respectively. You can omit the cost parameters, as in
the second version of the function; in that case they all default to 1.
the second version of the function; in that case they all default to 1.
<literal>levenshtein_less_equal</literal> is accelerated version of
levenshtein functon for low values of distance. If actual distance
is less or equal then max_d, then <literal>levenshtein_less_equal</literal>
returns accurate value of it. Otherwise this function returns value
which is greater than max_d.
</para>
</para>
<para>
<para>
...
@@ -110,6 +117,18 @@ test=# SELECT levenshtein('GUMBO', 'GAMBOL', 2,1,1);
...
@@ -110,6 +117,18 @@ test=# SELECT levenshtein('GUMBO', 'GAMBOL', 2,1,1);
-------------
-------------
3
3
(1 row)
(1 row)
test=# SELECT levenshtein_less_equal('extensive', 'exhaustive',2);
levenshtein_less_equal
------------------------
3
(1 row)
test=# SELECT levenshtein_less_equal('extensive', 'exhaustive',4);
levenshtein_less_equal
------------------------
4
(1 row)
</screen>
</screen>
</sect2>
</sect2>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment