Commit 82849df6 authored by Tom Lane

Add new selectivity estimation functions for pattern-matching operators

(LIKE and regexp matches).  These are not yet referenced in pg_operator,
so by default the system will continue to use eqsel/neqsel.
Also, tweak convert_to_scalar() logic so that common prefixes of strings
are stripped off, allowing better accuracy when all strings in a table
share a common prefix.
parent 8c3b52e7
<!--
$Header: /cvsroot/pgsql/doc/src/sgml/xoper.sgml,v 1.9 2000/03/31 03:27:41 thomas Exp $
$Header: /cvsroot/pgsql/doc/src/sgml/xoper.sgml,v 1.10 2000/04/16 04:41:01 tgl Exp $
-->
<Chapter Id="xoper">
......@@ -254,9 +254,9 @@ SELECT (a + b) AS c FROM test_complex;
<para>
You can frequently get away with using either eqsel or neqsel for
operators that have very high or very low selectivity, even if they
aren't really equality or inequality. For example, the regular expression
matching operators (~, ~*, etc) use eqsel on the assumption that they'll
usually only match a small fraction of the entries in a table.
aren't really equality or inequality. For example, the
approximate-equality geometric operators use eqsel on the assumption that
they'll usually only match a small fraction of the entries in a table.
</para>
<para>
......
......@@ -9,22 +9,20 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/indxpath.c,v 1.82 2000/04/12 17:15:19 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/indxpath.c,v 1.83 2000/04/16 04:41:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include <ctype.h>
#include <math.h>
#include "postgres.h"
#include <math.h>
#include "access/heapam.h"
#include "access/nbtree.h"
#include "catalog/catname.h"
#include "catalog/pg_amop.h"
#include "catalog/pg_operator.h"
#include "executor/executor.h"
#include "mb/pg_wchar.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
......@@ -46,11 +44,6 @@
#define is_indexable_operator(clause,opclass,relam,indexkey_on_left) \
(indexable_operator(clause,opclass,relam,indexkey_on_left) != InvalidOid)
/*
 * Result of scanning a pattern for a fixed prefix: the pattern has no
 * fixed prefix, has a partial (leading) fixed prefix, or can only match
 * one exact string.
 */
typedef enum
{
Prefix_None, Prefix_Partial, Prefix_Exact
} Prefix_Status;
static void match_index_orclauses(RelOptInfo *rel, IndexOptInfo *index,
List *restrictinfo_list);
static List *match_index_orclause(RelOptInfo *rel, IndexOptInfo *index,
......@@ -92,17 +85,11 @@ static bool function_index_operand(Expr *funcOpnd, RelOptInfo *rel,
IndexOptInfo *index);
static bool match_special_index_operator(Expr *clause, Oid opclass, Oid relam,
bool indexkey_on_left);
static Prefix_Status like_fixed_prefix(char *patt, char **prefix);
static Prefix_Status regex_fixed_prefix(char *patt, bool case_insensitive,
char **prefix);
static List *prefix_quals(Var *leftop, Oid expr_op,
char *prefix, Prefix_Status pstatus);
static char *make_greater_string(const char *str, Oid datatype);
char *prefix, Pattern_Prefix_Status pstatus);
static Oid find_operator(const char *opname, Oid datatype);
static Datum string_to_datum(const char *str, Oid datatype);
static Const *string_to_const(const char *str, Oid datatype);
static bool string_lessthan(const char *str1, const char *str2,
Oid datatype);
/*
......@@ -1644,6 +1631,7 @@ match_special_index_operator(Expr *clause, Oid opclass, Oid relam,
Datum constvalue;
char *patt;
char *prefix;
char *rest;
/*
* Currently, all known special operators require the indexkey on the
......@@ -1672,7 +1660,8 @@ match_special_index_operator(Expr *clause, Oid opclass, Oid relam,
case OID_NAME_LIKE_OP:
/* the right-hand const is type text for all of these */
patt = textout((text *) DatumGetPointer(constvalue));
isIndexable = like_fixed_prefix(patt, &prefix) != Prefix_None;
isIndexable = pattern_fixed_prefix(patt, Pattern_Type_Like,
&prefix, &rest) != Pattern_Prefix_None;
if (prefix)
pfree(prefix);
pfree(patt);
......@@ -1684,7 +1673,8 @@ match_special_index_operator(Expr *clause, Oid opclass, Oid relam,
case OID_NAME_REGEXEQ_OP:
/* the right-hand const is type text for all of these */
patt = textout((text *) DatumGetPointer(constvalue));
isIndexable = regex_fixed_prefix(patt, false, &prefix) != Prefix_None;
isIndexable = pattern_fixed_prefix(patt, Pattern_Type_Regex,
&prefix, &rest) != Pattern_Prefix_None;
if (prefix)
pfree(prefix);
pfree(patt);
......@@ -1696,7 +1686,8 @@ match_special_index_operator(Expr *clause, Oid opclass, Oid relam,
case OID_NAME_ICREGEXEQ_OP:
/* the right-hand const is type text for all of these */
patt = textout((text *) DatumGetPointer(constvalue));
isIndexable = regex_fixed_prefix(patt, true, &prefix) != Prefix_None;
isIndexable = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC,
&prefix, &rest) != Pattern_Prefix_None;
if (prefix)
pfree(prefix);
pfree(patt);
......@@ -1776,7 +1767,8 @@ expand_indexqual_conditions(List *indexquals)
Datum constvalue;
char *patt;
char *prefix;
Prefix_Status pstatus;
char *rest;
Pattern_Prefix_Status pstatus;
switch (expr_op)
{
......@@ -1794,7 +1786,8 @@ expand_indexqual_conditions(List *indexquals)
/* the right-hand const is type text for all of these */
constvalue = ((Const *) rightop)->constvalue;
patt = textout((text *) DatumGetPointer(constvalue));
pstatus = like_fixed_prefix(patt, &prefix);
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Like,
&prefix, &rest);
resultquals = nconc(resultquals,
prefix_quals(leftop, expr_op,
prefix, pstatus));
......@@ -1810,7 +1803,8 @@ expand_indexqual_conditions(List *indexquals)
/* the right-hand const is type text for all of these */
constvalue = ((Const *) rightop)->constvalue;
patt = textout((text *) DatumGetPointer(constvalue));
pstatus = regex_fixed_prefix(patt, false, &prefix);
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex,
&prefix, &rest);
resultquals = nconc(resultquals,
prefix_quals(leftop, expr_op,
prefix, pstatus));
......@@ -1826,7 +1820,8 @@ expand_indexqual_conditions(List *indexquals)
/* the right-hand const is type text for all of these */
constvalue = ((Const *) rightop)->constvalue;
patt = textout((text *) DatumGetPointer(constvalue));
pstatus = regex_fixed_prefix(patt, true, &prefix);
pstatus = pattern_fixed_prefix(patt, Pattern_Type_Regex_IC,
&prefix, &rest);
resultquals = nconc(resultquals,
prefix_quals(leftop, expr_op,
prefix, pstatus));
......@@ -1844,130 +1839,6 @@ expand_indexqual_conditions(List *indexquals)
return resultquals;
}
/*
 * like_fixed_prefix
 *	  Pull the leading literal text out of a LIKE pattern.
 *
 * *prefix always receives a freshly palloc'd, null-terminated string
 * holding the literal characters found before the first unquoted wildcard
 * (it may be empty).  The return value tells the caller how to interpret
 * it: Prefix_Exact if the whole pattern was literal, Prefix_Partial if a
 * non-empty literal prefix precedes a wildcard, Prefix_None otherwise.
 */
static Prefix_Status
like_fixed_prefix(char *patt, char **prefix)
{
	char	   *literal;
	char	   *cursor;
	int			nlit = 0;

	/* The literal part can never be longer than the pattern itself */
	literal = palloc(strlen(patt) + 1);
	*prefix = literal;

	for (cursor = patt; *cursor != '\0'; cursor++)
	{
		/* An unquoted % or _ is a LIKE wildcard and ends the literal part */
		if (*cursor == '%' || *cursor == '_')
			break;

		/* A backslash makes the following character literal */
		if (*cursor == '\\')
		{
			cursor++;
			if (*cursor == '\0')
				break;			/* trailing backslash: nothing to quote */
		}

		/*
		 * Note that "%%" is deliberately not treated as a literal %:
		 * neither textlike() nor the SQL92 spec gives it that meaning.
		 */
		literal[nlit++] = *cursor;
	}
	literal[nlit] = '\0';

	/*
	 * Running off the end of the pattern means it contained no wildcard,
	 * so it can only match exactly.  This includes the empty pattern.
	 */
	if (*cursor == '\0')
		return Prefix_Exact;

	return (nlit > 0) ? Prefix_Partial : Prefix_None;
}
/*
 * regex_fixed_prefix
 *	  Extract the fixed prefix, if any, for a regex pattern.
 *
 * patt is the null-terminated regex; case_insensitive says whether the
 * match ignores case (in which case letters cannot be part of a fixed
 * prefix).
 *
 * *prefix is set to NULL if the pattern is not left-anchored or contains
 * an unquoted | { or }.  Otherwise it is set to a palloc'd string holding
 * the literal characters every match must start with (possibly empty).
 *
 * The return value distinguishes no fixed prefix, a partial prefix,
 * or an exact-match-only pattern.
 */
static Prefix_Status
regex_fixed_prefix(char *patt, bool case_insensitive,
				   char **prefix)
{
	char	   *match;
	int			pos,
				match_pos;

	*prefix = NULL;

	/* Pattern must be anchored left */
	if (patt[0] != '^')
		return Prefix_None;

	/* Cannot optimize if unquoted | { } is present in pattern */
	for (pos = 1; patt[pos]; pos++)
	{
		if (patt[pos] == '|' ||
			patt[pos] == '{' ||
			patt[pos] == '}')
			return Prefix_None;
		if (patt[pos] == '\\')
		{
			pos++;
			if (patt[pos] == '\0')
				break;
		}
	}

	/* OK, allocate space for pattern */
	*prefix = match = palloc(strlen(patt) + 1);
	match_pos = 0;

	/* note start at pos 1 to skip leading ^ */
	for (pos = 1; patt[pos]; pos++)
	{
		/*
		 * These characters end the literal prefix but leave what we have
		 * collected so far intact.  '(' starts a group and '+' requires at
		 * least one occurrence of the preceding character, so neither may
		 * be copied as if it were literal text.
		 *
		 * The cast keeps isalpha() away from negative char values, which
		 * would be undefined behavior on platforms where char is signed.
		 *
		 * XXX isalpha() may still not be an adequately locale-sensitive
		 * test for characters that can vary under case folding.
		 */
		if (patt[pos] == '.' ||
			patt[pos] == '(' ||
			patt[pos] == '[' ||
			patt[pos] == '$' ||
			patt[pos] == '+' ||
			(case_insensitive && isalpha((unsigned char) patt[pos])))
			break;

		/*
		 * '*' and '?' make the preceding character optional, so that
		 * character is not part of the guaranteed prefix: remove it.
		 */
		if (patt[pos] == '*' ||
			patt[pos] == '?')
		{
			if (match_pos > 0)
			{
#ifdef MULTIBYTE
				/* drop a whole (possibly multi-byte) character */
				match_pos = pg_mbcliplen((const unsigned char *) match,
										 match_pos, match_pos - 1);
#else
				match_pos--;
#endif
			}
			break;
		}

		if (patt[pos] == '\\')
		{
			pos++;
			if (patt[pos] == '\0')
				break;
			/* a quoted letter still varies under case folding */
			if (case_insensitive && isalpha((unsigned char) patt[pos]))
				break;
		}
		match[match_pos++] = patt[pos];
	}

	match[match_pos] = '\0';

	if (patt[pos] == '$' && patt[pos + 1] == '\0')
		return Prefix_Exact;	/* pattern specifies exact match */

	if (match_pos > 0)
		return Prefix_Partial;
	return Prefix_None;
}
/*
* Given a fixed prefix that all the "leftop" values must have,
* generate suitable indexqual condition(s). expr_op is the original
......@@ -1976,7 +1847,7 @@ regex_fixed_prefix(char *patt, bool case_insensitive,
*/
static List *
prefix_quals(Var *leftop, Oid expr_op,
char *prefix, Prefix_Status pstatus)
char *prefix, Pattern_Prefix_Status pstatus)
{
List *result;
Oid datatype;
......@@ -1986,7 +1857,7 @@ prefix_quals(Var *leftop, Oid expr_op,
Expr *expr;
char *greaterstr;
Assert(pstatus != Prefix_None);
Assert(pstatus != Pattern_Prefix_None);
switch (expr_op)
{
......@@ -2022,7 +1893,7 @@ prefix_quals(Var *leftop, Oid expr_op,
/*
* If we found an exact-match pattern, generate an "=" indexqual.
*/
if (pstatus == Prefix_Exact)
if (pstatus == Pattern_Prefix_Exact)
{
oproid = find_operator("=", datatype);
if (oproid == InvalidOid)
......@@ -2067,68 +1938,6 @@ prefix_quals(Var *leftop, Oid expr_op,
return result;
}
/*
 * make_greater_string
 *	  Build a string that sorts after the given string and after every
 *	  string the given one is a prefix of.  Returns a palloc'd string on
 *	  success, or NULL if no such string could be found.
 *
 * Simply bumping the last byte ("foo" -> "fop") is not reliable in
 * non-ASCII locales with unusual collation orders, so every candidate is
 * verified with the datatype's own "<" comparison.  If a candidate is not
 * greater, the last byte is bumped again; once that byte is exhausted the
 * last character is chopped off and the process continues with the one
 * before it.  For instance, if "z" sorted last, "foo" would be returned
 * as a string greater than "fonz".
 *
 * Worst-case cost is high, but normally only one or two candidates are
 * tried.
 *
 * XXX in a sufficiently weird locale, this might produce incorrect results?
 * For example, in German I believe "ss" is treated specially --- if we are
 * given "foos" and return "foot", will this actually be greater than "fooss"?
 */
static char *
make_greater_string(const char *str, Oid datatype)
{
	/* Work on a private copy; it becomes the result when we succeed */
	char	   *candidate = pstrdup((char *) str);
	int			curlen;

	for (curlen = strlen(candidate); curlen > 0; curlen = strlen(candidate))
	{
		unsigned char *tail = (unsigned char *) (candidate + curlen - 1);

		/* Keep bumping the final byte until it sorts higher or maxes out */
		for (;;)
		{
			if (*tail >= (unsigned char) 255)
				break;
			++(*tail);
			if (string_lessthan(str, candidate, datatype))
				return candidate;	/* Success! */
		}

		/*
		 * Final byte exhausted: remove the last character (which can span
		 * several bytes when MULTIBYTE is enabled) and retry.
		 */
#ifdef MULTIBYTE
		curlen = pg_mbcliplen((const unsigned char *) candidate, curlen, curlen - 1);
		candidate[curlen] = '\0';
#else
		*tail = '\0';
#endif
	}

	/* Ran out of characters without finding a greater string */
	pfree(candidate);
	return NULL;
}
/*
* Handy subroutines for match_special_index_operator() and friends.
*/
......@@ -2179,45 +1988,3 @@ string_to_const(const char *str, Oid datatype)
return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
conval, false, false, false, false);
}
/*
 * string_lessthan
 *	  Report whether str1 "<" str2 under the comparison rules of the given
 *	  string datatype.
 *
 * Both C strings are converted to datums of that type and the type's own
 * "<" function is invoked, so the answer matches what the executor would
 * compute.  The temporary datums are freed before returning.
 */
static bool
string_lessthan(const char *str1, const char *str2, Oid datatype)
{
	Datum		lhs = string_to_datum(str1, datatype);
	Datum		rhs = string_to_datum(str2, datatype);
	bool		isless;

	if (datatype == TEXTOID)
		isless = text_lt((text *) lhs, (text *) rhs);
	else if (datatype == BPCHAROID)
		isless = bpcharlt((char *) lhs, (char *) rhs);
	else if (datatype == VARCHAROID)
		isless = varcharlt((char *) lhs, (char *) rhs);
	else if (datatype == NAMEOID)
		isless = namelt((NameData *) lhs, (NameData *) rhs);
	else
	{
		elog(ERROR, "string_lessthan: unexpected datatype %u", datatype);
		isless = false;
	}

	pfree(DatumGetPointer(lhs));
	pfree(DatumGetPointer(rhs));

	return isless;
}
......@@ -15,13 +15,14 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.64 2000/04/12 17:15:51 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.65 2000/04/16 04:41:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include <math.h>
#include "access/heapam.h"
......@@ -30,6 +31,7 @@
#include "catalog/pg_proc.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "mb/pg_wchar.h"
#include "optimizer/cost.h"
#include "parser/parse_func.h"
#include "parser/parse_oper.h"
......@@ -50,8 +52,23 @@
/* default selectivity estimate for inequalities such as "A < b" */
#define DEFAULT_INEQ_SEL (1.0 / 3.0)
static bool convert_string_to_scalar(char *str, int strlength,
double *scaleval);
/* default selectivity estimate for pattern-match operators such as LIKE */
#define DEFAULT_MATCH_SEL 0.01
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
Datum lobound, Datum hibound, Oid boundstypid,
double *scaledlobound, double *scaledhibound);
static double convert_numeric_to_scalar(Datum value, Oid typid);
static void convert_string_to_scalar(unsigned char *value,
double *scaledvalue,
unsigned char *lobound,
double *scaledlobound,
unsigned char *hibound,
double *scaledhibound);
static double convert_one_string_to_scalar(unsigned char *value,
int rangelo, int rangehi);
static unsigned char * convert_string_datum(Datum value, Oid typid);
static double convert_timevalue_to_scalar(Datum value, Oid typid);
static void getattproperties(Oid relid, AttrNumber attnum,
Oid *typid,
int *typlen,
......@@ -64,6 +81,15 @@ static bool getattstatistics(Oid relid, AttrNumber attnum,
Datum *commonval,
Datum *loval,
Datum *hival);
static Selectivity prefix_selectivity(char *prefix,
Oid relid,
AttrNumber attno,
Oid datatype);
static Selectivity pattern_selectivity(char *patt, Pattern_Type ptype);
static bool string_lessthan(const char *str1, const char *str2,
Oid datatype);
static Oid find_operator(const char *opname, Oid datatype);
static Datum string_to_datum(const char *str, Oid datatype);
/*
......@@ -71,9 +97,10 @@ static bool getattstatistics(Oid relid, AttrNumber attnum,
*
* Note: this routine is also used to estimate selectivity for some
* operators that are not "=" but have comparable selectivity behavior,
* such as "~~" (text LIKE). Even for "=" we must keep in mind that
* the left and right datatypes may differ, so the type of the given
* constant "value" may be different from the type of the attribute.
* such as "~=" (geometric approximate-match). Even for "=", we must
* keep in mind that the left and right datatypes may differ, so the type
* of the given constant "value" may be different from the type of the
* attribute.
*/
float64
eqsel(Oid opid,
......@@ -255,7 +282,8 @@ scalarltsel(Oid opid,
{
HeapTuple oprtuple;
Oid ltype,
rtype;
rtype,
contype;
Oid typid;
int typlen;
bool typbyval;
......@@ -277,23 +305,7 @@ scalarltsel(Oid opid,
elog(ERROR, "scalarltsel: no tuple for operator %u", opid);
ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
/* Convert the constant to a uniform comparison scale. */
if (!convert_to_scalar(value,
((flag & SEL_RIGHT) ? rtype : ltype),
&val))
{
/*
* Ideally we'd produce an error here, on the grounds that the
* given operator shouldn't have scalarltsel registered as its
* selectivity func unless we can deal with its operand types.
* But currently, all manner of stuff is invoking scalarltsel,
* so give a default estimate until that can be fixed.
*/
*result = DEFAULT_INEQ_SEL;
return result;
}
contype = (flag & SEL_RIGHT) ? rtype : ltype;
/* Now get info and stats about the attribute */
getattproperties(relid, attno,
......@@ -308,17 +320,24 @@ scalarltsel(Oid opid,
return result;
}
/* Convert the attribute's loval/hival to common scale. */
if (!convert_to_scalar(loval, typid, &low) ||
!convert_to_scalar(hival, typid, &high))
/* Convert the values to a uniform comparison scale. */
if (!convert_to_scalar(value, contype, &val,
loval, hival, typid,
&low, &high))
{
/* See above comments... */
/*
* Ideally we'd produce an error here, on the grounds that the
* given operator shouldn't have scalarltsel registered as its
* selectivity func unless we can deal with its operand types.
* But currently, all manner of stuff is invoking scalarltsel,
* so give a default estimate until that can be fixed.
*/
if (!typbyval)
{
pfree(DatumGetPointer(hival));
pfree(DatumGetPointer(loval));
}
*result = DEFAULT_INEQ_SEL;
return result;
}
......@@ -391,6 +410,183 @@ scalargtsel(Oid opid,
return result;
}
/*
 * patternsel - Generic code for pattern-match selectivity.
 *
 * opid is the pattern-match operator; its pg_operator entry supplies the
 * datatypes of the attribute (left) and the pattern (right).  ptype says
 * which pattern language applies.  relid/attno identify the attribute,
 * and value/flag describe the other operand.
 *
 * If the pattern is not a right-hand constant, DEFAULT_MATCH_SEL is
 * returned.  If the pattern can only match one exact string, the estimate
 * is delegated to eqsel() using the type's "=" operator.  Otherwise the
 * selectivities of the fixed prefix and of the rest of the pattern are
 * estimated separately and multiplied, clamped to 0..1.
 */
static float64
patternsel(Oid opid,
		   Pattern_Type ptype,
		   Oid relid,
		   AttrNumber attno,
		   Datum value,
		   int32 flag)
{
	float64		result;

	result = (float64) palloc(sizeof(float64data));
	/* Must have a constant for the pattern, or cannot learn anything */
	if ((flag & (SEL_CONSTANT | SEL_RIGHT)) != (SEL_CONSTANT | SEL_RIGHT))
		*result = DEFAULT_MATCH_SEL;
	else
	{
		HeapTuple	oprtuple;
		Oid			ltype,
					rtype;
		char	   *patt;
		Pattern_Prefix_Status pstatus;
		char	   *prefix;
		char	   *rest;

		/*
		 * Get left and right datatypes of the operator so we know what
		 * type the attribute is.
		 */
		oprtuple = get_operator_tuple(opid);
		if (!HeapTupleIsValid(oprtuple))
			elog(ERROR, "patternsel: no tuple for operator %u", opid);
		ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
		rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;

		/* the right-hand const is type text for all supported operators */
		Assert(rtype == TEXTOID);
		patt = textout((text *) DatumGetPointer(value));

		/* divide pattern into fixed prefix and remainder */
		pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);

		if (pstatus == Pattern_Prefix_Exact)
		{
			/* Pattern specifies an exact match, so pretend operator is '=' */
			Oid			eqopr = find_operator("=", ltype);
			Datum		eqcon;

			if (eqopr == InvalidOid)
				elog(ERROR, "patternsel: no = operator for type %u", ltype);
			eqcon = string_to_datum(prefix, ltype);

			/*
			 * eqsel() hands back its own result object, so release the one
			 * allocated above instead of leaking it when we overwrite the
			 * pointer.
			 */
			pfree(result);
			result = eqsel(eqopr, relid, attno, eqcon, SEL_CONSTANT|SEL_RIGHT);
			pfree(DatumGetPointer(eqcon));
		}
		else
		{
			/*
			 * Not exact-match pattern.  We estimate selectivity of the
			 * fixed prefix and remainder of pattern separately, then
			 * combine the two.
			 */
			Selectivity prefixsel;
			Selectivity restsel;
			Selectivity selec;

			if (pstatus == Pattern_Prefix_Partial)
				prefixsel = prefix_selectivity(prefix, relid, attno, ltype);
			else
				prefixsel = 1.0;	/* no prefix: it constrains nothing */
			restsel = pattern_selectivity(rest, ptype);
			selec = prefixsel * restsel;
			/* result should be in range, but make sure... */
			if (selec < 0.0)
				selec = 0.0;
			else if (selec > 1.0)
				selec = 1.0;
			*result = (float64data) selec;
		}
		if (prefix)
			pfree(prefix);
		pfree(patt);
	}
	return result;
}
/*
 * regexeqsel - Selectivity of regular-expression pattern match.
 *
 * Forwards all arguments to patternsel() with Pattern_Type_Regex and
 * returns its result unchanged.
 */
float64
regexeqsel(Oid opid,
Oid relid,
AttrNumber attno,
Datum value,
int32 flag)
{
return patternsel(opid, Pattern_Type_Regex, relid, attno, value, flag);
}
/*
 * icregexeqsel - Selectivity of case-insensitive regex match.
 *
 * Forwards all arguments to patternsel() with Pattern_Type_Regex_IC and
 * returns its result unchanged.
 */
float64
icregexeqsel(Oid opid,
Oid relid,
AttrNumber attno,
Datum value,
int32 flag)
{
return patternsel(opid, Pattern_Type_Regex_IC, relid, attno, value, flag);
}
/*
 * likesel - Selectivity of LIKE pattern match.
 *
 * Forwards all arguments to patternsel() with Pattern_Type_Like and
 * returns its result unchanged.
 */
float64
likesel(Oid opid,
Oid relid,
AttrNumber attno,
Datum value,
int32 flag)
{
return patternsel(opid, Pattern_Type_Like, relid, attno, value, flag);
}
/*
 * regexnesel
 *	  Selectivity of regular-expression pattern non-match: the complement
 *	  (1 - s) of the match selectivity computed by patternsel().
 */
float64
regexnesel(Oid opid,
		   Oid relid,
		   AttrNumber attno,
		   Datum value,
		   int32 flag)
{
	float64		matchsel = patternsel(opid, Pattern_Type_Regex,
									  relid, attno, value, flag);

	/* flip the match estimate in place and hand the same object back */
	*matchsel = 1.0 - *matchsel;
	return matchsel;
}
/*
 * icregexnesel
 *	  Selectivity of case-insensitive regex non-match: the complement
 *	  (1 - s) of the match selectivity computed by patternsel().
 */
float64
icregexnesel(Oid opid,
			 Oid relid,
			 AttrNumber attno,
			 Datum value,
			 int32 flag)
{
	float64		matchsel = patternsel(opid, Pattern_Type_Regex_IC,
									  relid, attno, value, flag);

	/* flip the match estimate in place and hand the same object back */
	*matchsel = 1.0 - *matchsel;
	return matchsel;
}
/*
 * nlikesel
 *	  Selectivity of LIKE pattern non-match: the complement (1 - s) of
 *	  the match selectivity computed by patternsel().
 */
float64
nlikesel(Oid opid,
		 Oid relid,
		 AttrNumber attno,
		 Datum value,
		 int32 flag)
{
	float64		matchsel = patternsel(opid, Pattern_Type_Like,
									  relid, attno, value, flag);

	/* flip the match estimate in place and hand the same object back */
	*matchsel = 1.0 - *matchsel;
	return matchsel;
}
/*
* eqjoinsel - Join selectivity of "="
*/
......@@ -491,9 +687,112 @@ scalargtjoinsel(Oid opid,
return result;
}
/*
 * regexeqjoinsel
 *	  Join selectivity of regular-expression pattern match.
 *
 * No statistics are consulted; the arguments are ignored and a palloc'd
 * DEFAULT_MATCH_SEL is returned.
 */
float64
regexeqjoinsel(Oid opid,
			   Oid relid1,
			   AttrNumber attno1,
			   Oid relid2,
			   AttrNumber attno2)
{
	float64		sel = (float64) palloc(sizeof(float64data));

	*sel = DEFAULT_MATCH_SEL;
	return sel;
}
/*
 * icregexeqjoinsel
 *	  Join selectivity of case-insensitive regex match.
 *
 * No statistics are consulted; the arguments are ignored and a palloc'd
 * DEFAULT_MATCH_SEL is returned.
 */
float64
icregexeqjoinsel(Oid opid,
				 Oid relid1,
				 AttrNumber attno1,
				 Oid relid2,
				 AttrNumber attno2)
{
	float64		sel = (float64) palloc(sizeof(float64data));

	*sel = DEFAULT_MATCH_SEL;
	return sel;
}
/*
 * likejoinsel
 *	  Join selectivity of LIKE pattern match.
 *
 * No statistics are consulted; the arguments are ignored and a palloc'd
 * DEFAULT_MATCH_SEL is returned.
 */
float64
likejoinsel(Oid opid,
			Oid relid1,
			AttrNumber attno1,
			Oid relid2,
			AttrNumber attno2)
{
	float64		sel = (float64) palloc(sizeof(float64data));

	*sel = DEFAULT_MATCH_SEL;
	return sel;
}
/*
 * regexnejoinsel
 *	  Join selectivity of regex non-match: the complement (1 - s) of
 *	  regexeqjoinsel()'s estimate.
 */
float64
regexnejoinsel(Oid opid,
			   Oid relid1,
			   AttrNumber attno1,
			   Oid relid2,
			   AttrNumber attno2)
{
	float64		matchsel = regexeqjoinsel(opid, relid1, attno1, relid2, attno2);

	/* flip the match estimate in place and hand the same object back */
	*matchsel = 1.0 - *matchsel;
	return matchsel;
}
/*
 * icregexnejoinsel
 *	  Join selectivity of case-insensitive regex non-match: the complement
 *	  (1 - s) of icregexeqjoinsel()'s estimate.
 */
float64
icregexnejoinsel(Oid opid,
				 Oid relid1,
				 AttrNumber attno1,
				 Oid relid2,
				 AttrNumber attno2)
{
	float64		matchsel = icregexeqjoinsel(opid, relid1, attno1, relid2, attno2);

	/* flip the match estimate in place and hand the same object back */
	*matchsel = 1.0 - *matchsel;
	return matchsel;
}
/*
 * nlikejoinsel
 *	  Join selectivity of LIKE pattern non-match: the complement (1 - s)
 *	  of likejoinsel()'s estimate.
 */
float64
nlikejoinsel(Oid opid,
			 Oid relid1,
			 AttrNumber attno1,
			 Oid relid2,
			 AttrNumber attno2)
{
	float64		matchsel = likejoinsel(opid, relid1, attno1, relid2, attno2);

	/* flip the match estimate in place and hand the same object back */
	*matchsel = 1.0 - *matchsel;
	return matchsel;
}
/*
* convert_to_scalar
* Convert a non-NULL value of the indicated type to the comparison
* Convert non-NULL values of the indicated types to the comparison
* scale needed by scalarltsel()/scalargtsel().
* Returns "true" if successful.
*
......@@ -501,7 +800,8 @@ scalargtjoinsel(Oid opid,
* "double" values.
*
* String datatypes are converted by convert_string_to_scalar(),
* which is explained below.
* which is explained below. The reason why this routine deals with
* three values at a time, not just one, is that we need it for strings.
*
* The several datatypes representing absolute times are all converted
* to Timestamp, which is actually a double, and then we just use that
......@@ -511,237 +811,349 @@ scalargtjoinsel(Oid opid,
* The several datatypes representing relative times (intervals) are all
* converted to measurements expressed in seconds.
*/
bool
convert_to_scalar(Datum value, Oid typid,
double *scaleval)
static bool
convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
Datum lobound, Datum hibound, Oid boundstypid,
double *scaledlobound, double *scaledhibound)
{
switch (typid)
switch (valuetypid)
{
/*
* Built-in numeric types
*/
case BOOLOID:
*scaleval = (double) DatumGetUInt8(value);
return true;
/*
* Built-in numeric types
*/
case BOOLOID:
case INT2OID:
*scaleval = (double) DatumGetInt16(value);
return true;
case INT4OID:
*scaleval = (double) DatumGetInt32(value);
return true;
case INT8OID:
*scaleval = (double) (*i8tod((int64 *) DatumGetPointer(value)));
return true;
case FLOAT4OID:
*scaleval = (double) (*DatumGetFloat32(value));
return true;
case FLOAT8OID:
*scaleval = (double) (*DatumGetFloat64(value));
return true;
case NUMERICOID:
*scaleval = (double) (*numeric_float8((Numeric) DatumGetPointer(value)));
return true;
case OIDOID:
case REGPROCOID:
/* we can treat OIDs as integers... */
*scaleval = (double) DatumGetObjectId(value);
*scaledvalue = convert_numeric_to_scalar(value, valuetypid);
*scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
*scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
return true;
/*
* Built-in string types
*/
/*
* Built-in string types
*/
case CHAROID:
{
char ch = DatumGetChar(value);
return convert_string_to_scalar(&ch, 1, scaleval);
}
case BPCHAROID:
case VARCHAROID:
case TEXTOID:
{
char *str = (char *) VARDATA(DatumGetPointer(value));
int strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
return convert_string_to_scalar(str, strlength, scaleval);
}
case NAMEOID:
{
NameData *nm = (NameData *) DatumGetPointer(value);
return convert_string_to_scalar(NameStr(*nm), strlen(NameStr(*nm)),
scaleval);
}
{
unsigned char *valstr = convert_string_datum(value, valuetypid);
unsigned char *lostr = convert_string_datum(lobound, boundstypid);
unsigned char *histr = convert_string_datum(hibound, boundstypid);
convert_string_to_scalar(valstr, scaledvalue,
lostr, scaledlobound,
histr, scaledhibound);
pfree(valstr);
pfree(lostr);
pfree(histr);
return true;
}
/*
* Built-in absolute-time types
*/
/*
* Built-in time types
*/
case TIMESTAMPOID:
*scaleval = *((Timestamp *) DatumGetPointer(value));
return true;
case ABSTIMEOID:
*scaleval = *abstime_timestamp(value);
return true;
case DATEOID:
*scaleval = *date_timestamp(value);
return true;
/*
* Built-in relative-time types
*/
case INTERVALOID:
{
Interval *interval = (Interval *) DatumGetPointer(value);
/*
* Convert the month part of Interval to days using
* assumed average month length of 365.25/12.0 days. Not
* too accurate, but plenty good enough for our purposes.
*/
*scaleval = interval->time +
interval->month * (365.25 / 12.0 * 24.0 * 60.0 * 60.0);
return true;
}
case RELTIMEOID:
*scaleval = (RelativeTime) DatumGetInt32(value);
return true;
case TINTERVALOID:
{
TimeInterval interval = (TimeInterval) DatumGetPointer(value);
if (interval->status != 0)
{
*scaleval = interval->data[1] - interval->data[0];
return true;
}
break;
}
case TIMEOID:
*scaleval = *((TimeADT *) DatumGetPointer(value));
*scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
*scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
*scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
return true;
default:
{
/*
* See whether there is a registered type-conversion
* function, namely a procedure named "float8" with the
* right signature. If so, assume we can convert the value
* to the numeric scale.
*
* NOTE: there are no such procedures in the standard
* distribution, except with argument types that we
* already dealt with above. This code is just here as an
* escape for user-defined types.
*/
Oid oid_array[FUNC_MAX_ARGS];
HeapTuple ftup;
MemSet(oid_array, 0, FUNC_MAX_ARGS * sizeof(Oid));
oid_array[0] = typid;
ftup = SearchSysCacheTuple(PROCNAME,
PointerGetDatum("float8"),
Int32GetDatum(1),
PointerGetDatum(oid_array),
0);
if (HeapTupleIsValid(ftup) &&
((Form_pg_proc) GETSTRUCT(ftup))->prorettype == FLOAT8OID)
{
RegProcedure convertproc = (RegProcedure) ftup->t_data->t_oid;
Datum converted = (Datum) fmgr(convertproc, value);
*scaleval = (double) (*DatumGetFloat64(converted));
return true;
}
break;
}
}
/* Don't know how to convert */
return false;
}
/*
 * convert_numeric_to_scalar
 *	  Numeric-datatype half of convert_to_scalar(): turn a datum of one of
 *	  the built-in numeric types into a double.
 *
 * Raises an error for any other type.
 */
static double
convert_numeric_to_scalar(Datum value, Oid typid)
{
	double		scaled;

	switch (typid)
	{
		case BOOLOID:
			scaled = (double) DatumGetUInt8(value);
			break;
		case INT2OID:
			scaled = (double) DatumGetInt16(value);
			break;
		case INT4OID:
			scaled = (double) DatumGetInt32(value);
			break;
		case INT8OID:
			scaled = (double) (*i8tod((int64 *) DatumGetPointer(value)));
			break;
		case FLOAT4OID:
			scaled = (double) (*DatumGetFloat32(value));
			break;
		case FLOAT8OID:
			scaled = (double) (*DatumGetFloat64(value));
			break;
		case NUMERICOID:
			scaled = (double) (*numeric_float8((Numeric) DatumGetPointer(value)));
			break;
		case OIDOID:
		case REGPROCOID:
			/* OIDs can simply be treated as integers */
			scaled = (double) DatumGetObjectId(value);
			break;
		default:

			/*
			 * Only reachable if scalarltsel/scalargtsel is applied to an
			 * operator having one numeric and one non-numeric operand.
			 */
			elog(ERROR, "convert_numeric_to_scalar: unsupported type %u", typid);
			scaled = 0;
			break;
	}

	return scaled;
}
/*
* Do convert_to_scalar()'s work for any character-string data type.
*
* String datatypes are converted to a scale that ranges from 0 to 1, where
* we visualize the bytes of the string as fractional base-256 digits.
* It's sufficient to consider the first few bytes, since double has only
* limited precision (and we can't expect huge accuracy in our selectivity
* predictions anyway!)
* String datatypes are converted to a scale that ranges from 0 to 1,
* where we visualize the bytes of the string as fractional digits.
*
* If USE_LOCALE is defined, we must pass the string through strxfrm()
* before doing the computation, so as to generate correct locale-specific
* results.
* We do not want the base to be 256, however, since that tends to
* generate inflated selectivity estimates; few databases will have
* occurrences of all 256 possible byte values at each position.
* Instead, use the smallest and largest byte values seen in the bounds
* as the estimated range for each byte, after some fudging to deal with
* the fact that we probably aren't going to see the full range that way.
*
* An additional refinement is that we discard any common prefix of the
* three strings before computing the scaled values. This allows us to
* "zoom in" when we encounter a narrow data range. An example is a phone
* number database where all the values begin with the same area code.
*/
static bool
convert_string_to_scalar(char *str, int strlength,
double *scaleval)
/*
 * convert_string_to_scalar
 *	  Map a string value and its low/high bound strings onto a common
 *	  numeric scale.
 *
 * value, lobound and hibound are null-terminated byte strings; the three
 * scaled results are stored through scaledvalue, scaledlobound and
 * scaledhibound.
 *
 * The per-byte value range is estimated from the bytes that occur in the
 * two bounds, widened to whole character classes.  Any prefix common to
 * all three strings is then skipped before each string is converted by
 * convert_one_string_to_scalar().
 */
static void
convert_string_to_scalar(unsigned char *value,
						 double *scaledvalue,
						 unsigned char *lobound,
						 double *scaledlobound,
						 unsigned char *hibound,
						 double *scaledhibound)
{
	int			rangelo,
				rangehi;
	unsigned char *sptr;

	/* Find the smallest and largest byte values occurring in the bounds */
	rangelo = rangehi = hibound[0];
	for (sptr = lobound; *sptr; sptr++)
	{
		if (rangelo > *sptr)
			rangelo = *sptr;
		if (rangehi < *sptr)
			rangehi = *sptr;
	}
	for (sptr = hibound; *sptr; sptr++)
	{
		if (rangelo > *sptr)
			rangelo = *sptr;
		if (rangehi < *sptr)
			rangehi = *sptr;
	}
	/* If range includes any upper-case ASCII chars, make it include all */
	if (rangelo <= 'Z' && rangehi >= 'A')
	{
		if (rangelo > 'A')
			rangelo = 'A';
		if (rangehi < 'Z')
			rangehi = 'Z';
	}
	/* Ditto lower-case */
	if (rangelo <= 'z' && rangehi >= 'a')
	{
		if (rangelo > 'a')
			rangelo = 'a';
		if (rangehi < 'z')
			rangehi = 'z';
	}
	/* Ditto digits */
	if (rangelo <= '9' && rangehi >= '0')
	{
		if (rangelo > '0')
			rangelo = '0';
		if (rangehi < '9')
			rangehi = '9';
	}

	/*
	 * If range includes less than 10 chars, assume we have not got enough
	 * data, and make it include regular ASCII set.
	 */
	if (rangehi - rangelo < 9)
	{
		rangelo = ' ';
		rangehi = 127;
	}

	/*
	 * Now strip any common prefix of the three strings.  The loop stops at
	 * the end of lobound, or at the first position where the strings differ.
	 */
	while (*lobound)
	{
		if (*lobound != *hibound || *lobound != *value)
			break;
		lobound++, hibound++, value++;
	}

	/*
	 * Now we can do the conversions.
	 */
	*scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
	*scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
	*scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
}
static double
convert_one_string_to_scalar(unsigned char *value, int rangelo, int rangehi)
{
	/*
	 * Interpret the leading bytes of a NUL-terminated string as the digits
	 * of a fraction in base (rangehi - rangelo + 1).  Bytes outside the
	 * range are clamped to one step below rangelo or one step above rangehi.
	 * An empty string yields 0.
	 */
	const double radix = rangehi - rangelo + 1;
	double		fraction = 0.0;
	double		weight = radix;
	int			nchars = strlen((char *) value);
	int			i;

	/* Since base is at least 10, need not consider more than about 20 chars */
	if (nchars > 20)
		nchars = 20;

	for (i = 0; i < nchars; i++)
	{
		int			digit = value[i];

		if (digit < rangelo)
			digit = rangelo - 1;
		else if (digit > rangehi)
			digit = rangehi + 1;

		/* each successive byte contributes one more "decimal place" */
		fraction += ((double) (digit - rangelo)) / weight;
		weight *= radix;
	}

	return fraction;
}
/*
* Convert a string-type Datum into a palloc'd, null-terminated string.
*
* If USE_LOCALE is defined, we must pass the string through strxfrm()
* before continuing, so as to generate correct locale-specific results.
*/
static unsigned char *
convert_string_datum(Datum value, Oid typid)
{
	/*
	 * Copy the string held by a char, bpchar, varchar, text or name Datum
	 * into a freshly palloc'd, NUL-terminated buffer and return it.  When
	 * USE_LOCALE is defined the copy is replaced by its strxfrm() transform.
	 * Any other type OID is reported through elog(ERROR).
	 */
	char	   *val;

#ifdef USE_LOCALE
	char	   *xfrmstr;
	size_t		xfrmsize;
	size_t		xfrmlen;
#endif

	switch (typid)
	{
		case CHAROID:
			val = (char *) palloc(2);
			val[0] = DatumGetChar(value);
			val[1] = '\0';
			break;
		case BPCHAROID:
		case VARCHAROID:
		case TEXTOID:
			{
				char	   *str = (char *) VARDATA(DatumGetPointer(value));
				int			strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;

				/* the stored bytes carry no terminator, so add one */
				val = (char *) palloc(strlength + 1);
				memcpy(val, str, strlength);
				val[strlength] = '\0';
				break;
			}
		case NAMEOID:
			{
				NameData   *nm = (NameData *) DatumGetPointer(value);

				val = pstrdup(NameStr(*nm));
				break;
			}
		default:

			/*
			 * Can't get here unless someone tries to use scalarltsel on an
			 * operator with one string and one non-string operand.
			 */
			elog(ERROR, "convert_string_datum: unsupported type %u", typid);
			return NULL;
	}

#ifdef USE_LOCALE
	/* Guess that transformed string is not much bigger than original */
	xfrmsize = strlen(val) + 32;	/* arbitrary pad value here... */
	xfrmstr = (char *) palloc(xfrmsize);
	xfrmlen = strxfrm(xfrmstr, val, xfrmsize);
	if (xfrmlen >= xfrmsize)
	{
		/* Oops, didn't make it; retry with the size strxfrm() asked for */
		pfree(xfrmstr);
		xfrmstr = (char *) palloc(xfrmlen + 1);
		xfrmlen = strxfrm(xfrmstr, val, xfrmlen + 1);
	}
	/* hand back the transformed copy in place of the raw one */
	pfree(val);
	val = xfrmstr;
#endif

	return (unsigned char *) val;
}
/* Convert initial characters to fraction */
num = 0.0;
denom = 256.0;
while (slen-- > 0)
/*
* Do convert_to_scalar()'s work for any timevalue data type.
*/
static double
convert_timevalue_to_scalar(Datum value, Oid typid)
{
	/*
	 * Map a time-related Datum to a double, dispatching on its type OID.
	 * Type OIDs not listed below are reported through elog(ERROR).
	 */
	switch (typid)
	{
		case TIMESTAMPOID:
			return *((Timestamp *) DatumGetPointer(value));
		case ABSTIMEOID:
			return *abstime_timestamp(value);
		case DATEOID:
			return *date_timestamp(value);
		case INTERVALOID:
			{
				Interval   *interval = (Interval *) DatumGetPointer(value);

				/*
				 * Fold the month part of the Interval into the time part
				 * using an assumed average month length of 365.25/12.0
				 * days, scaled by 24*60*60.  Not too accurate, but plenty
				 * good enough for our purposes.
				 */
				return interval->time +
					interval->month * (365.25 / 12.0 * 24.0 * 60.0 * 60.0);
			}
		case RELTIMEOID:
			return (RelativeTime) DatumGetInt32(value);
		case TINTERVALOID:
			{
				TimeInterval interval = (TimeInterval) DatumGetPointer(value);

				/* use the span between the two endpoints when status is set */
				if (interval->status != 0)
					return interval->data[1] - interval->data[0];
				return 0;		/* for lack of a better idea */
			}
		case TIMEOID:
			return *((TimeADT *) DatumGetPointer(value));
	}

	/*
	 * Can't get here unless someone tries to use scalarltsel/scalargtsel on
	 * an operator with one timevalue and one non-timevalue operand.
	 */
	elog(ERROR, "convert_timevalue_to_scalar: unsupported type %u", typid);
	return 0;
}
......@@ -914,6 +1326,623 @@ getattstatistics(Oid relid,
return true;
}
/*-------------------------------------------------------------------------
*
* Pattern analysis functions
*
* These routines support analysis of LIKE and regular-expression patterns
* by the planner/optimizer. It's important that they agree with the
* regular-expression code in backend/regex/ and the LIKE code in
* backend/utils/adt/like.c.
*
* Note that the prefix-analysis functions are called from
* backend/optimizer/path/indxpath.c as well as from routines in this file.
*
*-------------------------------------------------------------------------
*/
/*
* Extract the fixed prefix, if any, for a pattern.
* *prefix is set to a palloc'd prefix string,
* or to NULL if no fixed prefix exists for the pattern.
* *rest is set to point to the remainder of the pattern after the
* portion describing the fixed prefix.
* The return value distinguishes no fixed prefix, a partial prefix,
* or an exact-match-only pattern.
*/
static Pattern_Prefix_Status
like_fixed_prefix(char *patt, char **prefix, char **rest)
{
	/*
	 * Collect the literal characters at the start of a LIKE pattern, up to
	 * the first unquoted wildcard.  On return *rest points at the first
	 * unconsumed pattern character and *prefix at the palloc'd literal text
	 * (or NULL when the result is Pattern_Prefix_None).
	 */
	char	   *cursor = patt;
	char	   *buf = palloc(strlen(patt) + 1);
	int			nfixed = 0;

	*prefix = buf;

	/* % and _ are wildcard characters in LIKE; stop at the first one */
	while (*cursor != '\0' && *cursor != '%' && *cursor != '_')
	{
		/* Backslash quotes the next character */
		if (*cursor == '\\')
		{
			cursor++;
			if (*cursor == '\0')
				break;			/* dangling backslash at end of pattern */
		}

		/*
		 * NOTE: this code used to think that %% meant a literal %, but
		 * textlike() itself does not think that, and the SQL92 spec
		 * doesn't say any such thing either.
		 */
		buf[nfixed++] = *cursor++;
	}
	buf[nfixed] = '\0';
	*rest = cursor;

	/* in LIKE, an empty pattern is an exact match! */
	if (*cursor == '\0')
		return Pattern_Prefix_Exact;	/* consumed the whole pattern */

	if (nfixed > 0)
		return Pattern_Prefix_Partial;

	/* pattern starts with a wildcard: nothing usable */
	pfree(buf);
	*prefix = NULL;
	return Pattern_Prefix_None;
}
static Pattern_Prefix_Status
regex_fixed_prefix(char *patt, bool case_insensitive,
				   char **prefix, char **rest)
{
	/*
	 * Extract the fixed prefix of a regular-expression pattern.  Only
	 * patterns anchored with a leading ^ and having no top-level |
	 * alternation can yield a prefix.  *prefix receives a palloc'd string
	 * (or NULL for Pattern_Prefix_None) and *rest points at the part of the
	 * pattern not described by the prefix.
	 */
	char	   *match;
	int			pos,
				match_pos,
				paren_depth;

	/* Pattern must be anchored left */
	if (patt[0] != '^')
	{
		*prefix = NULL;
		*rest = patt;
		return Pattern_Prefix_None;
	}

	/*
	 * If unquoted | is present at paren level 0 in pattern, then there are
	 * multiple alternatives for the start of the string.
	 */
	paren_depth = 0;
	for (pos = 1; patt[pos]; pos++)
	{
		if (patt[pos] == '|' && paren_depth == 0)
		{
			*prefix = NULL;
			*rest = patt;
			return Pattern_Prefix_None;
		}
		else if (patt[pos] == '(')
			paren_depth++;
		else if (patt[pos] == ')' && paren_depth > 0)
			paren_depth--;
		else if (patt[pos] == '\\')
		{
			/* backslash quotes the next character */
			pos++;
			if (patt[pos] == '\0')
				break;
		}
	}

	/* OK, allocate space for pattern */
	*prefix = match = palloc(strlen(patt) + 1);
	match_pos = 0;

	/* note start at pos 1 to skip leading ^ */
	for (pos = 1; patt[pos]; pos++)
	{
		/*
		 * Check for characters that indicate multiple possible matches
		 * here.  XXX I suspect isalpha() is not an adequately
		 * locale-sensitive test for characters that can vary under case
		 * folding?
		 *
		 * The cast to unsigned char is required: passing a negative char
		 * value (any high-bit byte where char is signed) to isalpha() is
		 * undefined behavior.
		 */
		if (patt[pos] == '.' ||
			patt[pos] == '(' ||
			patt[pos] == '[' ||
			patt[pos] == '$' ||
			(case_insensitive && isalpha((unsigned char) patt[pos])))
			break;

		/*
		 * Check for quantifiers.  Except for +, this means the preceding
		 * character is optional, so we must remove it from the prefix too!
		 * In either case pos is backed up so that *rest starts at the
		 * quantified character.
		 */
		if (patt[pos] == '*' ||
			patt[pos] == '?' ||
			patt[pos] == '{')
		{
			if (match_pos > 0)
				match_pos--;
			pos--;
			break;
		}
		if (patt[pos] == '+')
		{
			pos--;
			break;
		}
		if (patt[pos] == '\\')
		{
			/* backslash quotes the next character */
			pos++;
			if (patt[pos] == '\0')
				break;
		}
		match[match_pos++] = patt[pos];
	}

	match[match_pos] = '\0';
	*rest = &patt[pos];

	/* A lone trailing $ means the prefix is the entire match */
	if (patt[pos] == '$' && patt[pos + 1] == '\0')
	{
		*rest = &patt[pos + 1];
		return Pattern_Prefix_Exact;	/* pattern specifies exact match */
	}

	if (match_pos > 0)
		return Pattern_Prefix_Partial;

	pfree(match);
	*prefix = NULL;
	return Pattern_Prefix_None;
}
Pattern_Prefix_Status
pattern_fixed_prefix(char *patt, Pattern_Type ptype,
					 char **prefix, char **rest)
{
	/*
	 * Dispatch fixed-prefix extraction to the routine matching the pattern
	 * type; the regex variants differ only in the case-insensitivity flag.
	 */
	switch (ptype)
	{
		case Pattern_Type_Like:
			return like_fixed_prefix(patt, prefix, rest);
		case Pattern_Type_Regex:
			return regex_fixed_prefix(patt, false, prefix, rest);
		case Pattern_Type_Regex_IC:
			return regex_fixed_prefix(patt, true, prefix, rest);
		default:
			break;
	}

	elog(ERROR, "pattern_fixed_prefix: bogus ptype");
	return Pattern_Prefix_None; /* keep compiler quiet */
}
/*
* Estimate the selectivity of a fixed prefix for a pattern match.
*
* A fixed prefix "foo" is estimated as the selectivity of the expression
* "var >= 'foo' AND var < 'fop'" (see also indxpath.c).
*/
static Selectivity
prefix_selectivity(char *prefix,
				   Oid relid,
				   AttrNumber attno,
				   Oid datatype)
{
	/*
	 * Estimate how selective a fixed prefix is by treating it as a range
	 * condition: "var >= prefix", combined with "var < next-greater-string"
	 * whenever such a string can be generated.
	 */
	Selectivity rangesel;
	Selectivity uppersel;
	Oid			boundop;
	Datum		boundcon;
	char	   *upperstr;

	/* Lower bound: var >= prefix */
	boundop = find_operator(">=", datatype);
	if (boundop == InvalidOid)
		elog(ERROR, "prefix_selectivity: no >= operator for type %u",
			 datatype);
	boundcon = string_to_datum(prefix, datatype);
	/* Assume scalargtsel is appropriate for all supported types */
	rangesel = *scalargtsel(boundop, relid, attno,
							boundcon, SEL_CONSTANT | SEL_RIGHT);
	pfree(DatumGetPointer(boundcon));

	/* Without an upper bound string, the lower-bound estimate is all we have */
	upperstr = make_greater_string(prefix, datatype);
	if (!upperstr)
		return rangesel;

	/* Upper bound: var < upperstr */
	boundop = find_operator("<", datatype);
	if (boundop == InvalidOid)
		elog(ERROR, "prefix_selectivity: no < operator for type %u",
			 datatype);
	boundcon = string_to_datum(upperstr, datatype);
	/* Assume scalarltsel is appropriate for all supported types */
	uppersel = *scalarltsel(boundop, relid, attno,
							boundcon, SEL_CONSTANT | SEL_RIGHT);
	pfree(DatumGetPointer(boundcon));
	pfree(upperstr);

	/*
	 * Merge the two selectivities in the same way as for a range query (see
	 * clauselist_selectivity()).
	 */
	rangesel = uppersel + rangesel - 1.0;

	/*
	 * A zero or slightly negative result most likely comes from roundoff on
	 * a very tight range, so substitute a small positive value.  A very
	 * negative result suggests default estimates on one or both sides of
	 * the range; substitute a default that is small, but not real small.
	 */
	if (rangesel <= 0.0)
		rangesel = (rangesel < -0.01) ? 0.01 : 1.0e-10;

	return rangesel;
}
/*
* Estimate the selectivity of a pattern of the specified type.
* Note that any fixed prefix of the pattern will have been removed already.
*
* For now, we use a very simplistic approach: fixed characters reduce the
* selectivity a good deal, character ranges reduce it a little,
* wildcards (such as % for LIKE or .* for regex) increase it.
*/
/* Factor applied for each literal (or backslash-quoted) pattern character */
#define FIXED_CHAR_SEL 0.04 /* about 1/25 */
/* Factor applied for a regex bracket expression; a negated one uses 1 minus this */
#define CHAR_RANGE_SEL 0.25
/* Factor applied for a single-character wildcard (LIKE _ or regex .) */
#define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
/* Factor applied for each LIKE % and for a regex lacking a trailing $ */
#define FULL_WILDCARD_SEL 5.0
/* Factor applied for each regex quantifier (* ? + or a {...} bound) */
#define PARTIAL_WILDCARD_SEL 2.0
static Selectivity
like_selectivity(char *patt)
{
	/*
	 * Walk a LIKE pattern and multiply together one factor per pattern
	 * element; the product is capped at 1.0.
	 */
	Selectivity estimate = 1.0;
	char	   *p = patt;

	/* Skip any leading %; it's already factored into initial sel */
	if (*p == '%')
		p++;

	for (; *p != '\0'; p++)
	{
		if (*p == '\\')
		{
			/* Backslash quotes the next character */
			p++;
			if (*p == '\0')
				break;			/* dangling backslash contributes nothing */
			estimate *= FIXED_CHAR_SEL;
			continue;
		}

		/* % and _ are wildcard characters in LIKE */
		switch (*p)
		{
			case '%':
				estimate *= FULL_WILDCARD_SEL;
				break;
			case '_':
				estimate *= ANY_CHAR_SEL;
				break;
			default:
				estimate *= FIXED_CHAR_SEL;
				break;
		}
	}

	/* Could get sel > 1 if multiple wildcards */
	return (estimate > 1.0) ? 1.0 : estimate;
}
/*
 * Estimate the selectivity of the first pattlen bytes of a regex pattern.
 *
 * Starts from 1.0 and multiplies in one factor per top-level pattern
 * element.  A parenthesized group is estimated by a recursive call on its
 * contents, and a top-level | adds the estimate of the remaining
 * alternative(s).  The result is capped at 1.0.
 *
 * case_insensitive is only passed along to the recursive calls; nothing in
 * this function otherwise consults it.
 */
static Selectivity
regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
{
Selectivity sel = 1.0;
int paren_depth = 0;
int paren_pos = 0; /* dummy init to keep compiler quiet */
int pos;
for (pos = 0; pos < pattlen; pos++)
{
if (patt[pos] == '(')
{
if (paren_depth == 0)
paren_pos = pos; /* remember start of parenthesized item */
paren_depth++;
}
else if (patt[pos] == ')' && paren_depth > 0)
{
paren_depth--;
/*
 * On closing the outermost group, estimate its contents (the text
 * strictly between the parentheses) with a recursive call.
 */
if (paren_depth == 0)
sel *= regex_selectivity_sub(patt + (paren_pos + 1),
pos - (paren_pos + 1),
case_insensitive);
}
else if (patt[pos] == '|' && paren_depth == 0)
{
/*
* If unquoted | is present at paren level 0 in pattern,
* we have multiple alternatives; sum their probabilities.
*/
sel += regex_selectivity_sub(patt + (pos + 1),
pattlen - (pos + 1),
case_insensitive);
break; /* rest of pattern is now processed */
}
else if (patt[pos] == '[')
{
/*
 * Bracket expression: skip forward to the closing ']' and count the
 * whole thing as one character-class factor.
 */
bool negclass = false;
if (patt[++pos] == '^')
{
negclass = true;
pos++;
}
if (patt[pos] == ']') /* ']' at start of class is not special */
pos++;
while (pos < pattlen && patt[pos] != ']')
pos++;
/* pos now rests on the ']' (or at pattlen); the loop's pos++ steps past it */
if (paren_depth == 0)
sel *= (negclass ? (1.0-CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
}
else if (patt[pos] == '.')
{
if (paren_depth == 0)
sel *= ANY_CHAR_SEL;
}
else if (patt[pos] == '*' ||
patt[pos] == '?' ||
patt[pos] == '+')
{
/* Ought to be smarter about quantifiers... */
if (paren_depth == 0)
sel *= PARTIAL_WILDCARD_SEL;
}
else if (patt[pos] == '{')
{
/* Bound such as {m,n}: skip to the closing brace, treat as a quantifier */
while (pos < pattlen && patt[pos] != '}')
pos++;
if (paren_depth == 0)
sel *= PARTIAL_WILDCARD_SEL;
}
else if (patt[pos] == '\\')
{
/* backslash quotes the next character */
pos++;
if (pos >= pattlen)
break;
if (paren_depth == 0)
sel *= FIXED_CHAR_SEL;
}
else
{
/*
 * Anything else counts as a literal character.  This branch is also
 * reached by a ')' seen at depth 0 and by a '|' inside parentheses.
 */
if (paren_depth == 0)
sel *= FIXED_CHAR_SEL;
}
}
/* Could get sel > 1 if multiple wildcards */
if (sel > 1.0)
sel = 1.0;
return sel;
}
static Selectivity
regex_selectivity(char *patt, bool case_insensitive)
{
	/*
	 * Estimate the selectivity of a whole regex pattern.  A pattern ending
	 * in an unquoted $ is estimated as-is (minus the $); any other pattern
	 * is treated as though it ended with a full wildcard.
	 */
	Selectivity estimate;
	int			len = strlen(patt);
	bool		end_anchored;

	/* trailing $ counts only if it is not preceded by a backslash */
	end_anchored = (len > 0 &&
					patt[len - 1] == '$' &&
					(len == 1 || patt[len - 2] != '\\'));

	if (end_anchored)
		return regex_selectivity_sub(patt, len - 1, case_insensitive);

	/* If patt doesn't end with $, consider it to have a trailing wildcard */
	estimate = regex_selectivity_sub(patt, len, case_insensitive);
	estimate *= FULL_WILDCARD_SEL;
	if (estimate > 1.0)
		estimate = 1.0;

	return estimate;
}
static Selectivity
pattern_selectivity(char *patt, Pattern_Type ptype)
{
	/*
	 * Dispatch pattern selectivity estimation to the routine matching the
	 * pattern type.
	 */
	switch (ptype)
	{
		case Pattern_Type_Like:
			return like_selectivity(patt);
		case Pattern_Type_Regex:
			return regex_selectivity(patt, false);
		case Pattern_Type_Regex_IC:
			return regex_selectivity(patt, true);
		default:
			break;
	}

	elog(ERROR, "pattern_selectivity: bogus ptype");
	return 1.0;					/* keep compiler quiet */
}
/*
* Try to generate a string greater than the given string or any string it is
* a prefix of. If successful, return a palloc'd string; else return NULL.
*
* To work correctly in non-ASCII locales with weird collation orders,
* we cannot simply increment "foo" to "fop" --- we have to check whether
* we actually produced a string greater than the given one. If not,
* increment the righthand byte again and repeat. If we max out the righthand
* byte, truncate off the last character and start incrementing the next.
* For example, if "z" were the last character in the sort order, then we
* could produce "foo" as a string greater than "fonz".
*
* This could be rather slow in the worst case, but in most cases we won't
* have to try more than one or two strings before succeeding.
*
* XXX in a sufficiently weird locale, this might produce incorrect results?
* For example, in German I believe "ss" is treated specially --- if we are
* given "foos" and return "foot", will this actually be greater than "fooss"?
*/
char *
make_greater_string(const char *str, Oid datatype)
{
	/*
	 * Work on a modifiable copy of the input; the copy is what gets
	 * returned on success and freed on failure.
	 */
	char	   *candidate = pstrdup((char *) str);
	int			len;

	for (len = strlen(candidate); len > 0; len = strlen(candidate))
	{
		unsigned char *tail = (unsigned char *) (candidate + len - 1);

		/*
		 * Bump the final byte one step at a time until the datatype's own
		 * comparison says the candidate sorts after the original.
		 */
		while (*tail < (unsigned char) 255)
		{
			++(*tail);
			if (string_lessthan(str, candidate, datatype))
				return candidate;	/* Success! */
		}

		/*
		 * Final byte is maxed out: drop the last character (which might be
		 * more than 1 byte in MULTIBYTE case) and retry with a shorter
		 * string.
		 */
#ifdef MULTIBYTE
		len = pg_mbcliplen((const unsigned char *) candidate, len, len - 1);
		candidate[len] = '\0';
#else
		*tail = '\0';
#endif
	}

	/* Ran out of characters without finding a greater string */
	pfree(candidate);
	return NULL;
}
/*
* Test whether two strings are "<" according to the rules of the given
* datatype. We do this the hard way, ie, actually calling the type's
* "<" operator function, to ensure we get the right result...
*/
static bool
string_lessthan(const char *str1, const char *str2, Oid datatype)
{
	/*
	 * Build Datums for both C strings, compare them with the "<" function
	 * belonging to the given datatype, then release the Datums.
	 */
	Datum		lhs = string_to_datum(str1, datatype);
	Datum		rhs = string_to_datum(str2, datatype);
	bool		isless;

	if (datatype == TEXTOID)
		isless = text_lt((text *) lhs, (text *) rhs);
	else if (datatype == BPCHAROID)
		isless = bpcharlt((char *) lhs, (char *) rhs);
	else if (datatype == VARCHAROID)
		isless = varcharlt((char *) lhs, (char *) rhs);
	else if (datatype == NAMEOID)
		isless = namelt((NameData *) lhs, (NameData *) rhs);
	else
	{
		elog(ERROR, "string_lessthan: unexpected datatype %u", datatype);
		isless = false;
	}

	pfree(DatumGetPointer(lhs));
	pfree(DatumGetPointer(rhs));

	return isless;
}
/* See if there is a binary op of the given name for the given datatype */
static Oid
find_operator(const char *opname, Oid datatype)
{
	/*
	 * Look up the binary ('b') operator called opname whose left and right
	 * operands are both of the given datatype; InvalidOid if there is none.
	 */
	HeapTuple	tup = SearchSysCacheTuple(OPERNAME,
										  PointerGetDatum(opname),
										  ObjectIdGetDatum(datatype),
										  ObjectIdGetDatum(datatype),
										  CharGetDatum('b'));

	return HeapTupleIsValid(tup) ? tup->t_data->t_oid : InvalidOid;
}
/*
* Generate a Datum of the appropriate type from a C string.
* Note that all of the supported types are pass-by-ref, so the
* returned value should be pfree'd if no longer needed.
*/
static Datum
string_to_datum(const char *str, Oid datatype)
{
	char	   *cstr = (char *) str;

	/*
	 * Only name needs its own input routine.  We cheat a little by assuming
	 * that textin() will do for bpchar and varchar constants too...
	 */
	if (datatype != NAMEOID)
		return PointerGetDatum(textin(cstr));

	return PointerGetDatum(namein(cstr));
}
/*-------------------------------------------------------------------------
*
* Index cost estimation functions
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: pg_proc.h,v 1.132 2000/04/12 17:16:29 momjian Exp $
* $Id: pg_proc.h,v 1.133 2000/04/16 04:41:03 tgl Exp $
*
* NOTES
* The script catalog/genbki.sh reads this file and generates .bki
......@@ -2436,6 +2436,32 @@ DESCR("convert text to timestamp");
DATA(insert OID = 1780 ( to_date PGUID 11 f t f 2 f 1082 "25 25" 100 0 0 100 to_date - ));
DESCR("convert text to date");
/* Selectivity estimators for LIKE and related operators */
DATA(insert OID = 1818 ( regexeqsel PGUID 11 f t f 5 f 701 "26 26 21 0 23" 100 0 0 100 regexeqsel - ));
DESCR("restriction selectivity of regex match");
DATA(insert OID = 1819 ( likesel PGUID 11 f t f 5 f 701 "26 26 21 0 23" 100 0 0 100 likesel - ));
DESCR("restriction selectivity of LIKE");
DATA(insert OID = 1820 ( icregexeqsel PGUID 11 f t f 5 f 701 "26 26 21 0 23" 100 0 0 100 icregexeqsel - ));
DESCR("restriction selectivity of case-insensitive regex match");
DATA(insert OID = 1821 ( regexnesel PGUID 11 f t f 5 f 701 "26 26 21 0 23" 100 0 0 100 regexnesel - ));
DESCR("restriction selectivity of regex non-match");
DATA(insert OID = 1822 ( nlikesel PGUID 11 f t f 5 f 701 "26 26 21 0 23" 100 0 0 100 nlikesel - ));
DESCR("restriction selectivity of NOT LIKE");
DATA(insert OID = 1823 ( icregexnesel PGUID 11 f t f 5 f 701 "26 26 21 0 23" 100 0 0 100 icregexnesel - ));
DESCR("restriction selectivity of case-insensitive regex non-match");
DATA(insert OID = 1824 ( regexeqjoinsel PGUID 11 f t f 5 f 701 "26 26 21 26 21" 100 0 0 100 regexeqjoinsel - ));
DESCR("join selectivity of regex match");
DATA(insert OID = 1825 ( likejoinsel PGUID 11 f t f 5 f 701 "26 26 21 26 21" 100 0 0 100 likejoinsel - ));
DESCR("join selectivity of LIKE");
DATA(insert OID = 1826 ( icregexeqjoinsel PGUID 11 f t f 5 f 701 "26 26 21 26 21" 100 0 0 100 icregexeqjoinsel - ));
DESCR("join selectivity of case-insensitive regex match");
DATA(insert OID = 1827 ( regexnejoinsel PGUID 11 f t f 5 f 701 "26 26 21 26 21" 100 0 0 100 regexnejoinsel - ));
DESCR("join selectivity of regex non-match");
DATA(insert OID = 1828 ( nlikejoinsel PGUID 11 f t f 5 f 701 "26 26 21 26 21" 100 0 0 100 nlikejoinsel - ));
DESCR("join selectivity of NOT LIKE");
DATA(insert OID = 1829 ( icregexnejoinsel PGUID 11 f t f 5 f 701 "26 26 21 26 21" 100 0 0 100 icregexnejoinsel - ));
DESCR("join selectivity of case-insensitive regex non-match");
/*
* prototypes for functions pg_proc.c
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: builtins.h,v 1.110 2000/04/12 17:16:54 momjian Exp $
* $Id: builtins.h,v 1.111 2000/04/16 04:41:03 tgl Exp $
*
* NOTES
* This should normally only be included by fmgr.h.
......@@ -371,15 +371,47 @@ extern char *deparse_expression(Node *expr, List *rangetables,
bool forceprefix);
/* selfuncs.c */
extern float64 eqsel(Oid opid, Oid relid, AttrNumber attno, Datum value, int32 flag);
extern float64 neqsel(Oid opid, Oid relid, AttrNumber attno, Datum value, int32 flag);
extern float64 scalarltsel(Oid opid, Oid relid, AttrNumber attno, Datum value, int32 flag);
extern float64 scalargtsel(Oid opid, Oid relid, AttrNumber attno, Datum value, int32 flag);
extern float64 eqjoinsel(Oid opid, Oid relid1, AttrNumber attno1, Oid relid2, AttrNumber attno2);
extern float64 neqjoinsel(Oid opid, Oid relid1, AttrNumber attno1, Oid relid2, AttrNumber attno2);
extern float64 scalarltjoinsel(Oid opid, Oid relid1, AttrNumber attno1, Oid relid2, AttrNumber attno2);
extern float64 scalargtjoinsel(Oid opid, Oid relid1, AttrNumber attno1, Oid relid2, AttrNumber attno2);
extern bool convert_to_scalar(Datum value, Oid typid, double *scaleval);
extern float64 eqsel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 neqsel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 scalarltsel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 scalargtsel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 regexeqsel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 likesel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 icregexeqsel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 regexnesel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 nlikesel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 icregexnesel(Oid opid, Oid relid, AttrNumber attno,
Datum value, int32 flag);
extern float64 eqjoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern float64 neqjoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern float64 scalarltjoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern float64 scalargtjoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern float64 regexeqjoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern float64 likejoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern float64 icregexeqjoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern float64 regexnejoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern float64 nlikejoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern float64 icregexnejoinsel(Oid opid, Oid relid1, AttrNumber attno1,
Oid relid2, AttrNumber attno2);
extern void btcostestimate(Query *root, RelOptInfo *rel,
IndexOptInfo *index, List *indexQuals,
......@@ -402,6 +434,22 @@ extern void gistcostestimate(Query *root, RelOptInfo *rel,
Cost *indexTotalCost,
Selectivity *indexSelectivity);
/*
 * Kind of pattern handed to the pattern-analysis routines: a LIKE pattern,
 * a regular expression, or a case-insensitive regular expression.
 */
typedef enum
{
Pattern_Type_Like, Pattern_Type_Regex, Pattern_Type_Regex_IC
} Pattern_Type;
/*
 * Result of fixed-prefix extraction: the pattern has no fixed prefix, has a
 * fixed prefix followed by more pattern, or can only match one exact string.
 */
typedef enum
{
Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact
} Pattern_Prefix_Status;
extern Pattern_Prefix_Status pattern_fixed_prefix(char *patt,
Pattern_Type ptype,
char **prefix,
char **rest);
extern char *make_greater_string(const char *str, Oid datatype);
/* tid.c */
extern ItemPointer tidin(const char *str);
extern char *tidout(ItemPointer itemPtr);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment