Commit d6061d2f authored by Tom Lane's avatar Tom Lane

Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the

form '^(foo)$'.  Before, these could never be optimized into indexscans.
The recent changes to make psql and pg_dump generate such patterns (for \d
commands and -t and related switches, respectively) therefore represented
a big performance hit for people with large pg_class catalogs, as seen in
recent gripe from Erik Jones.  While at it, be more paranoid about
case-sensitivity checking in multibyte encodings, and fix some other
corner cases in which a regex might be interpreted too liberally.
parent 9c88830e
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.66 2006/10/04 00:29:59 momjian Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.67 2007/01/03 22:39:26 tgl Exp $
* *
* Alistair Crooks added the code for the regex caching * Alistair Crooks added the code for the regex caching
* agc - cached the regular expressions used - there's a good chance * agc - cached the regular expressions used - there's a good chance
...@@ -624,3 +624,12 @@ similar_escape(PG_FUNCTION_ARGS) ...@@ -624,3 +624,12 @@ similar_escape(PG_FUNCTION_ARGS)
PG_RETURN_TEXT_P(result); PG_RETURN_TEXT_P(result);
} }
/*
* report whether regex_flavor is currently BASIC
*/
bool
regex_flavor_is_basic(void)
{
return (regex_flavor == REG_BASIC);
}
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.216 2006/12/23 00:43:11 tgl Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.217 2007/01/03 22:39:26 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -3805,7 +3805,10 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata, ...@@ -3805,7 +3805,10 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
* These routines support analysis of LIKE and regular-expression patterns * These routines support analysis of LIKE and regular-expression patterns
* by the planner/optimizer. It's important that they agree with the * by the planner/optimizer. It's important that they agree with the
* regular-expression code in backend/regex/ and the LIKE code in * regular-expression code in backend/regex/ and the LIKE code in
* backend/utils/adt/like.c. * backend/utils/adt/like.c. Also, the computation of the fixed prefix
* must be conservative: if we report a string longer than the true fixed
* prefix, the query may produce actually wrong answers, rather than just
* getting a bad selectivity estimate!
* *
* Note that the prefix-analysis functions are called from * Note that the prefix-analysis functions are called from
* backend/optimizer/path/indxpath.c as well as from routines in this file. * backend/optimizer/path/indxpath.c as well as from routines in this file.
...@@ -3837,6 +3840,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, ...@@ -3837,6 +3840,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
Oid typeid = patt_const->consttype; Oid typeid = patt_const->consttype;
int pos, int pos,
match_pos; match_pos;
bool is_multibyte = (pg_database_encoding_max_length() > 1);
/* the right-hand const is type text or bytea */ /* the right-hand const is type text or bytea */
Assert(typeid == BYTEAOID || typeid == TEXTOID); Assert(typeid == BYTEAOID || typeid == TEXTOID);
...@@ -3880,11 +3884,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, ...@@ -3880,11 +3884,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
} }
/* /*
* XXX I suspect isalpha() is not an adequately locale-sensitive test * XXX In multibyte character sets, we can't trust isalpha, so assume
* for characters that can vary under case folding? * any multibyte char is potentially case-varying.
*/ */
if (case_insensitive && isalpha((unsigned char) patt[pos])) if (case_insensitive)
break; {
if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
break;
if (isalpha((unsigned char) patt[pos]))
break;
}
/* /*
* NOTE: this code used to think that %% meant a literal %, but * NOTE: this code used to think that %% meant a literal %, but
...@@ -3929,11 +3938,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, ...@@ -3929,11 +3938,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
int pos, int pos,
match_pos, match_pos,
prev_pos, prev_pos,
prev_match_pos, prev_match_pos;
paren_depth; bool have_leading_paren;
char *patt; char *patt;
char *rest; char *rest;
Oid typeid = patt_const->consttype; Oid typeid = patt_const->consttype;
bool is_basic = regex_flavor_is_basic();
bool is_multibyte = (pg_database_encoding_max_length() > 1);
/* /*
* Should be unnecessary, there are no bytea regex operators defined. As * Should be unnecessary, there are no bytea regex operators defined. As
...@@ -3948,8 +3959,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, ...@@ -3948,8 +3959,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
/* the right-hand const is type text for all of these */ /* the right-hand const is type text for all of these */
patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue)); patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
/*
* Check for ARE director prefix. It's worth our trouble to recognize
* this because similar_escape() uses it.
*/
pos = 0;
if (strncmp(patt, "***:", 4) == 0)
{
pos = 4;
is_basic = false;
}
/* Pattern must be anchored left */ /* Pattern must be anchored left */
if (patt[0] != '^') if (patt[pos] != '^')
{ {
rest = patt; rest = patt;
...@@ -3958,72 +3980,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, ...@@ -3958,72 +3980,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
return Pattern_Prefix_None; return Pattern_Prefix_None;
} }
pos++;
/* /*
* If unquoted | is present at paren level 0 in pattern, then there are * If '|' is present in pattern, then there may be multiple alternatives
* multiple alternatives for the start of the string. * for the start of the string. (There are cases where this isn't so,
* for instance if the '|' is inside parens, but detecting that reliably
* is too hard.)
*/ */
paren_depth = 0; if (strchr(patt + pos, '|') != NULL)
for (pos = 1; patt[pos]; pos++)
{ {
if (patt[pos] == '|' && paren_depth == 0) rest = patt;
{
rest = patt;
*prefix_const = NULL; *prefix_const = NULL;
*rest_const = string_to_const(rest, typeid); *rest_const = string_to_const(rest, typeid);
return Pattern_Prefix_None; return Pattern_Prefix_None;
}
else if (patt[pos] == '(')
paren_depth++;
else if (patt[pos] == ')' && paren_depth > 0)
paren_depth--;
else if (patt[pos] == '\\')
{
/* backslash quotes the next character */
pos++;
if (patt[pos] == '\0')
break;
}
} }
/* OK, allocate space for pattern */ /* OK, allocate space for pattern */
match = palloc(strlen(patt) + 1); match = palloc(strlen(patt) + 1);
prev_match_pos = match_pos = 0; prev_match_pos = match_pos = 0;
/* note start at pos 1 to skip leading ^ */ /*
for (prev_pos = pos = 1; patt[pos];) * We special-case the syntax '^(...)$' because psql uses it. But beware:
* in BRE mode these parentheses are just ordinary characters. Also,
* sequences beginning "(?" are not what they seem, unless they're "(?:".
* (We should recognize that, too, because of similar_escape().)
*
* Note: it's a bit bogus to be depending on the current regex_flavor
* setting here, because the setting could change before the pattern is
* used. We minimize the risk by trusting the flavor as little as we can,
* but perhaps it would be a good idea to get rid of the "basic" setting.
*/
have_leading_paren = false;
if (patt[pos] == '(' && !is_basic &&
(patt[pos + 1] != '?' || patt[pos + 2] == ':'))
{
have_leading_paren = true;
pos += (patt[pos + 1] != '?' ? 1 : 3);
}
/* Scan remainder of pattern */
prev_pos = pos;
while (patt[pos])
{ {
int len; int len;
/* /*
* Check for characters that indicate multiple possible matches here. * Check for characters that indicate multiple possible matches here.
* XXX I suspect isalpha() is not an adequately locale-sensitive test * Also, drop out at ')' or '$' so the termination test works right.
* for characters that can vary under case folding?
*/ */
if (patt[pos] == '.' || if (patt[pos] == '.' ||
patt[pos] == '(' || patt[pos] == '(' ||
patt[pos] == ')' ||
patt[pos] == '[' || patt[pos] == '[' ||
patt[pos] == '$' || patt[pos] == '^' ||
(case_insensitive && isalpha((unsigned char) patt[pos]))) patt[pos] == '$')
break; break;
/* /*
* In AREs, backslash followed by alphanumeric is an escape, not a * XXX In multibyte character sets, we can't trust isalpha, so assume
* quoted character. Must treat it as having multiple possible * any multibyte char is potentially case-varying.
* matches.
*/ */
if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1])) if (case_insensitive)
break; {
if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
break;
if (isalpha((unsigned char) patt[pos]))
break;
}
/* /*
* Check for quantifiers. Except for +, this means the preceding * Check for quantifiers. Except for +, this means the preceding
* character is optional, so we must remove it from the prefix too! * character is optional, so we must remove it from the prefix too!
* Note: in BREs, \{ is a quantifier.
*/ */
if (patt[pos] == '*' || if (patt[pos] == '*' ||
patt[pos] == '?' || patt[pos] == '?' ||
patt[pos] == '{') patt[pos] == '{' ||
(patt[pos] == '\\' && patt[pos + 1] == '{'))
{ {
match_pos = prev_match_pos; match_pos = prev_match_pos;
pos = prev_pos; pos = prev_pos;
...@@ -4034,9 +4070,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, ...@@ -4034,9 +4070,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
pos = prev_pos; pos = prev_pos;
break; break;
} }
/*
* Normally, backslash quotes the next character. But in AREs,
* backslash followed by alphanumeric is an escape, not a quoted
* character. Must treat it as having multiple possible matches.
* In BREs, \( is a parenthesis, so don't trust that either.
* Note: since only ASCII alphanumerics are escapes, we don't have
* to be paranoid about multibyte here.
*/
if (patt[pos] == '\\') if (patt[pos] == '\\')
{ {
/* backslash quotes the next character */ if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(')
break;
pos++; pos++;
if (patt[pos] == '\0') if (patt[pos] == '\0')
break; break;
...@@ -4054,6 +4100,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, ...@@ -4054,6 +4100,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
match[match_pos] = '\0'; match[match_pos] = '\0';
rest = &patt[pos]; rest = &patt[pos];
if (have_leading_paren && patt[pos] == ')')
pos++;
if (patt[pos] == '$' && patt[pos + 1] == '\0') if (patt[pos] == '$' && patt[pos + 1] == '\0')
{ {
rest = &patt[pos + 1]; rest = &patt[pos + 1];
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.283 2006/12/30 21:21:55 tgl Exp $ * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.284 2007/01/03 22:39:26 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -477,6 +477,7 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS); ...@@ -477,6 +477,7 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS);
extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS); extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
extern Datum textregexreplace(PG_FUNCTION_ARGS); extern Datum textregexreplace(PG_FUNCTION_ARGS);
extern Datum similar_escape(PG_FUNCTION_ARGS); extern Datum similar_escape(PG_FUNCTION_ARGS);
extern bool regex_flavor_is_basic(void);
/* regproc.c */ /* regproc.c */
extern Datum regprocin(PG_FUNCTION_ARGS); extern Datum regprocin(PG_FUNCTION_ARGS);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment