Commit 9946b83d authored by Tom Lane's avatar Tom Lane

Bring SIMILAR TO and SUBSTRING into some semblance of conformance with

the SQL99 standard.  (I'm not sure that the character-class features are
quite right, but that can be fixed later.)  Document SQL99 and POSIX
regexps as being different features; provide variants of SUBSTRING for
each.
parent ac355d55
This diff is collapsed.
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.367 2002/09/18 21:35:21 tgl Exp $ * $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.368 2002/09/22 17:27:23 tgl Exp $
* *
* HISTORY * HISTORY
* AUTHOR DATE MAJOR EVENT * AUTHOR DATE MAJOR EVENT
...@@ -5644,22 +5644,40 @@ a_expr: c_expr { $$ = $1; } ...@@ -5644,22 +5644,40 @@ a_expr: c_expr { $$ = $1; }
} }
| a_expr SIMILAR TO a_expr %prec SIMILAR | a_expr SIMILAR TO a_expr %prec SIMILAR
{ $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, $4); } {
A_Const *c = makeNode(A_Const);
FuncCall *n = makeNode(FuncCall);
c->val.type = T_Null;
n->funcname = SystemFuncName("similar_escape");
n->args = makeList2($4, (Node *) c);
n->agg_star = FALSE;
n->agg_distinct = FALSE;
$$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n);
}
| a_expr SIMILAR TO a_expr ESCAPE a_expr | a_expr SIMILAR TO a_expr ESCAPE a_expr
{ {
FuncCall *n = makeNode(FuncCall); FuncCall *n = makeNode(FuncCall);
n->funcname = SystemFuncName("like_escape"); n->funcname = SystemFuncName("similar_escape");
n->args = makeList2($4, $6); n->args = makeList2($4, $6);
n->agg_star = FALSE; n->agg_star = FALSE;
n->agg_distinct = FALSE; n->agg_distinct = FALSE;
$$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n); $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n);
} }
| a_expr NOT SIMILAR TO a_expr %prec SIMILAR | a_expr NOT SIMILAR TO a_expr %prec SIMILAR
{ $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, $5); } {
A_Const *c = makeNode(A_Const);
FuncCall *n = makeNode(FuncCall);
c->val.type = T_Null;
n->funcname = SystemFuncName("similar_escape");
n->args = makeList2($5, (Node *) c);
n->agg_star = FALSE;
n->agg_distinct = FALSE;
$$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, (Node *) n);
}
| a_expr NOT SIMILAR TO a_expr ESCAPE a_expr | a_expr NOT SIMILAR TO a_expr ESCAPE a_expr
{ {
FuncCall *n = makeNode(FuncCall); FuncCall *n = makeNode(FuncCall);
n->funcname = SystemFuncName("like_escape"); n->funcname = SystemFuncName("similar_escape");
n->args = makeList2($5, $7); n->args = makeList2($5, $7);
n->agg_star = FALSE; n->agg_star = FALSE;
n->agg_distinct = FALSE; n->agg_distinct = FALSE;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.42 2002/09/04 20:31:28 momjian Exp $ * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.43 2002/09/22 17:27:23 tgl Exp $
* *
* Alistair Crooks added the code for the regex caching * Alistair Crooks added the code for the regex caching
* agc - cached the regular expressions used - there's a good chance * agc - cached the regular expressions used - there's a good chance
...@@ -317,8 +317,7 @@ textregexsubstr(PG_FUNCTION_ARGS) ...@@ -317,8 +317,7 @@ textregexsubstr(PG_FUNCTION_ARGS)
char *sterm; char *sterm;
int len; int len;
bool match; bool match;
int nmatch = 1; regmatch_t pmatch[2];
regmatch_t pmatch;
/* be sure sterm is null-terminated */ /* be sure sterm is null-terminated */
len = VARSIZE(s) - VARHDRSZ; len = VARSIZE(s) - VARHDRSZ;
...@@ -327,21 +326,131 @@ textregexsubstr(PG_FUNCTION_ARGS) ...@@ -327,21 +326,131 @@ textregexsubstr(PG_FUNCTION_ARGS)
sterm[len] = '\0'; sterm[len] = '\0';
/* /*
* We need the match info back from the pattern match to be able to * We pass two regmatch_t structs to get info about the overall match
* actually extract the substring. It seems to be adequate to pass in * and the match for the first parenthesized subexpression (if any).
* a structure to return only one result. * If there is a parenthesized subexpression, we return what it matched;
* else return what the whole regexp matched.
*/ */
match = RE_compile_and_execute(p, sterm, REG_EXTENDED, nmatch, &pmatch); match = RE_compile_and_execute(p, sterm, REG_EXTENDED, 2, pmatch);
pfree(sterm); pfree(sterm);
/* match? then return the substring matching the pattern */ /* match? then return the substring matching the pattern */
if (match) if (match)
{ {
int so,
eo;
so = pmatch[1].rm_so;
eo = pmatch[1].rm_eo;
if (so < 0 || eo < 0)
{
/* no parenthesized subexpression */
so = pmatch[0].rm_so;
eo = pmatch[0].rm_eo;
}
return (DirectFunctionCall3(text_substr, return (DirectFunctionCall3(text_substr,
PointerGetDatum(s), PointerGetDatum(s),
Int32GetDatum(pmatch.rm_so + 1), Int32GetDatum(so + 1),
Int32GetDatum(pmatch.rm_eo - pmatch.rm_so))); Int32GetDatum(eo - so)));
} }
PG_RETURN_NULL(); PG_RETURN_NULL();
} }
/* similar_escape()
* Convert a SQL99 regexp pattern to POSIX style, so it can be used by
* our regexp engine.
*/
Datum
similar_escape(PG_FUNCTION_ARGS)
{
text *pat_text;
text *esc_text;
text *result;
unsigned char *p,
*e,
*r;
int plen,
elen;
bool afterescape = false;
int nquotes = 0;
/* This function is not strict, so must test explicitly */
if (PG_ARGISNULL(0))
PG_RETURN_NULL();
pat_text = PG_GETARG_TEXT_P(0);
p = VARDATA(pat_text);
plen = (VARSIZE(pat_text) - VARHDRSZ);
if (PG_ARGISNULL(1))
{
/* No ESCAPE clause provided; default to backslash as escape */
e = "\\";
elen = 1;
}
else
{
esc_text = PG_GETARG_TEXT_P(1);
e = VARDATA(esc_text);
elen = (VARSIZE(esc_text) - VARHDRSZ);
if (elen == 0)
e = NULL; /* no escape character */
else if (elen != 1)
elog(ERROR, "ESCAPE string must be empty or one character");
}
/* We need room for ^, $, and up to 2 output bytes per input byte */
result = (text *) palloc(VARHDRSZ + 2 + 2 * plen);
r = VARDATA(result);
*r++ = '^';
while (plen > 0)
{
unsigned char pchar = *p;
if (afterescape)
{
if (pchar == '"') /* for SUBSTRING patterns */
*r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
else
{
*r++ = '\\';
*r++ = pchar;
}
afterescape = false;
}
else if (e && pchar == *e)
{
/* SQL99 escape character; do not send to output */
afterescape = true;
}
else if (pchar == '%')
{
*r++ = '.';
*r++ = '*';
}
else if (pchar == '_')
{
*r++ = '.';
}
else if (pchar == '\\' || pchar == '.' || pchar == '?' ||
pchar == '{')
{
*r++ = '\\';
*r++ = pchar;
}
else
{
*r++ = pchar;
}
p++, plen--;
}
*r++ = '$';
VARATT_SIZEP(result) = r - ((unsigned char *) result);
PG_RETURN_TEXT_P(result);
}
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $Id: pg_proc.h,v 1.272 2002/09/18 21:35:23 tgl Exp $ * $Id: pg_proc.h,v 1.273 2002/09/22 17:27:23 tgl Exp $
* *
* NOTES * NOTES
* The script catalog/genbki.sh reads this file and generates .bki * The script catalog/genbki.sh reads this file and generates .bki
...@@ -2076,6 +2076,9 @@ DESCR("convert int4 to char"); ...@@ -2076,6 +2076,9 @@ DESCR("convert int4 to char");
DATA(insert OID = 1622 ( repeat PGNSP PGUID 12 f f t f i 2 25 "25 23" repeat - _null_ )); DATA(insert OID = 1622 ( repeat PGNSP PGUID 12 f f t f i 2 25 "25 23" repeat - _null_ ));
DESCR("replicate string int4 times"); DESCR("replicate string int4 times");
DATA(insert OID = 1623 ( similar_escape PGNSP PGUID 12 f f f f i 2 25 "25 25" similar_escape - _null_ ));
DESCR("convert SQL99 regexp pattern to POSIX style");
DATA(insert OID = 1624 ( mul_d_interval PGNSP PGUID 12 f f t f i 2 1186 "701 1186" mul_d_interval - _null_ )); DATA(insert OID = 1624 ( mul_d_interval PGNSP PGUID 12 f f t f i 2 1186 "701 1186" mul_d_interval - _null_ ));
DATA(insert OID = 1633 ( texticlike PGNSP PGUID 12 f f t f i 2 16 "25 25" texticlike - _null_ )); DATA(insert OID = 1633 ( texticlike PGNSP PGUID 12 f f t f i 2 16 "25 25" texticlike - _null_ ));
...@@ -2087,7 +2090,7 @@ DESCR("matches LIKE expression, case-insensitive"); ...@@ -2087,7 +2090,7 @@ DESCR("matches LIKE expression, case-insensitive");
DATA(insert OID = 1636 ( nameicnlike PGNSP PGUID 12 f f t f i 2 16 "19 25" nameicnlike - _null_ )); DATA(insert OID = 1636 ( nameicnlike PGNSP PGUID 12 f f t f i 2 16 "19 25" nameicnlike - _null_ ));
DESCR("does not match LIKE expression, case-insensitive"); DESCR("does not match LIKE expression, case-insensitive");
DATA(insert OID = 1637 ( like_escape PGNSP PGUID 12 f f t f i 2 25 "25 25" like_escape - _null_ )); DATA(insert OID = 1637 ( like_escape PGNSP PGUID 12 f f t f i 2 25 "25 25" like_escape - _null_ ));
DESCR("convert match pattern to use backslash escapes"); DESCR("convert LIKE pattern to use backslash escapes");
DATA(insert OID = 1689 ( update_pg_pwd_and_pg_group PGNSP PGUID 12 f f t f v 0 2279 "" update_pg_pwd_and_pg_group - _null_ )); DATA(insert OID = 1689 ( update_pg_pwd_and_pg_group PGNSP PGUID 12 f f t f v 0 2279 "" update_pg_pwd_and_pg_group - _null_ ));
DESCR("update pg_pwd and pg_group files"); DESCR("update pg_pwd and pg_group files");
...@@ -2784,7 +2787,7 @@ DESCR("matches LIKE expression"); ...@@ -2784,7 +2787,7 @@ DESCR("matches LIKE expression");
DATA(insert OID = 2008 ( notlike PGNSP PGUID 12 f f t f i 2 16 "17 17" byteanlike - _null_ )); DATA(insert OID = 2008 ( notlike PGNSP PGUID 12 f f t f i 2 16 "17 17" byteanlike - _null_ ));
DESCR("does not match LIKE expression"); DESCR("does not match LIKE expression");
DATA(insert OID = 2009 ( like_escape PGNSP PGUID 12 f f t f i 2 17 "17 17" like_escape_bytea - _null_ )); DATA(insert OID = 2009 ( like_escape PGNSP PGUID 12 f f t f i 2 17 "17 17" like_escape_bytea - _null_ ));
DESCR("convert match pattern to use backslash escapes"); DESCR("convert LIKE pattern to use backslash escapes");
DATA(insert OID = 2010 ( length PGNSP PGUID 12 f f t f i 1 23 "17" byteaoctetlen - _null_ )); DATA(insert OID = 2010 ( length PGNSP PGUID 12 f f t f i 1 23 "17" byteaoctetlen - _null_ ));
DESCR("octet length"); DESCR("octet length");
DATA(insert OID = 2011 ( byteacat PGNSP PGUID 12 f f t f i 2 17 "17 17" byteacat - _null_ )); DATA(insert OID = 2011 ( byteacat PGNSP PGUID 12 f f t f i 2 17 "17 17" byteacat - _null_ ));
...@@ -2889,9 +2892,9 @@ DATA(insert OID = 2072 ( date_mi_interval PGNSP PGUID 14 f f t f i 2 1114 "1082 ...@@ -2889,9 +2892,9 @@ DATA(insert OID = 2072 ( date_mi_interval PGNSP PGUID 14 f f t f i 2 1114 "1082
DESCR("subtract"); DESCR("subtract");
DATA(insert OID = 2073 ( substring PGNSP PGUID 12 f f t f i 2 25 "25 25" textregexsubstr - _null_ )); DATA(insert OID = 2073 ( substring PGNSP PGUID 12 f f t f i 2 25 "25 25" textregexsubstr - _null_ ));
DESCR("substitutes regular expression"); DESCR("extracts text matching regular expression");
DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, like_escape($2, $3))" - _null_ )); DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, similar_escape($2, $3))" - _null_ ));
DESCR("substitutes regular expression with escape argument"); DESCR("extracts text matching SQL99 regular expression");
DATA(insert OID = 2075 ( bit PGNSP PGUID 12 f f t f i 1 1560 "20" bitfromint8 - _null_ )); DATA(insert OID = 2075 ( bit PGNSP PGUID 12 f f t f i 1 1560 "20" bitfromint8 - _null_ ));
DESCR("int8 to bitstring"); DESCR("int8 to bitstring");
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $Id: builtins.h,v 1.201 2002/09/19 22:48:34 tgl Exp $ * $Id: builtins.h,v 1.202 2002/09/22 17:27:25 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -371,6 +371,7 @@ extern Datum nameicregexne(PG_FUNCTION_ARGS); ...@@ -371,6 +371,7 @@ extern Datum nameicregexne(PG_FUNCTION_ARGS);
extern Datum texticregexeq(PG_FUNCTION_ARGS); extern Datum texticregexeq(PG_FUNCTION_ARGS);
extern Datum texticregexne(PG_FUNCTION_ARGS); extern Datum texticregexne(PG_FUNCTION_ARGS);
extern Datum textregexsubstr(PG_FUNCTION_ARGS); extern Datum textregexsubstr(PG_FUNCTION_ARGS);
extern Datum similar_escape(PG_FUNCTION_ARGS);
/* regproc.c */ /* regproc.c */
extern Datum regprocin(PG_FUNCTION_ARGS); extern Datum regprocin(PG_FUNCTION_ARGS);
......
...@@ -142,15 +142,15 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; ...@@ -142,15 +142,15 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
t t
(1 row) (1 row)
-- T581 regular expression substring -- T581 regular expression substring (with SQL99's bizarre regexp syntax)
SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd"; SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
bcd bcd
----- -----
bcd bcd
(1 row) (1 row)
-- No match should return NULL -- No match should return NULL
SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True"; SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
True True
------ ------
t t
...@@ -175,8 +175,16 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; ...@@ -175,8 +175,16 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
t t
(1 row) (1 row)
-- PostgreSQL extention to allow omitting the escape character -- PostgreSQL extension to allow omitting the escape character;
SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde"; -- here the regexp is taken as Posix syntax
SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
cde
-----
cde
(1 row)
-- With a parenthesized subexpression, return only what matches the subexpr
SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde";
cde cde
----- -----
cde cde
......
...@@ -62,19 +62,24 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890"; ...@@ -62,19 +62,24 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890";
SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456"; SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
-- T581 regular expression substring -- T581 regular expression substring (with SQL99's bizarre regexp syntax)
SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd"; SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
-- No match should return NULL -- No match should return NULL
SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True"; SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
-- Null inputs should return NULL -- Null inputs should return NULL
SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True"; SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True";
SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True"; SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True"; SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
-- PostgreSQL extention to allow omitting the escape character -- PostgreSQL extension to allow omitting the escape character;
SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde"; -- here the regexp is taken as Posix syntax
SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
-- With a parenthesized subexpression, return only what matches the subexpr
SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde";
-- E021-11 position expression -- E021-11 position expression
SELECT POSITION('4' IN '1234567890') = '4' AS "4"; SELECT POSITION('4' IN '1234567890') = '4' AS "4";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment