Commit 49fa99e5 authored by Tom Lane

Move pattern selectivity code from selfuncs.c to like_support.c.

While at it, refactor patternsel() a bit so that it can be used from
the LIKE/regex planner support functions as well.  This makes the
planner able to deal equally well with either operator or function
syntax for these operations.  I'm not excited about that as a feature
in itself, but it provides a nice model for extensions to follow if
they want such behavior for their operations.

This change localizes the use of pattern_fixed_prefix() and
make_greater_string() so that they no longer need be exported.
(We might get pushback from extensions about that, perhaps,
in which case I'd be inclined to re-export them in a new header
file like_support.h.)

This reduces the bulk of selfuncs.c a fair amount, removing ~1370
lines or about one-sixth of that file; it's still too big, but this
is progress.

Discussion: https://postgr.es/m/24537.1550093915@sss.pgh.pa.us
parent 109de05c
......@@ -34,17 +34,39 @@
*/
#include "postgres.h"
#include <math.h>
#include "access/htup_details.h"
#include "access/stratnum.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_opfamily.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "mb/pg_wchar.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/pg_locale.h"
#include "utils/selfuncs.h"
#include "utils/varlena.h"
/*
 * The kinds of pattern match this file can analyze.  The "_IC" members are
 * the case-insensitive flavors of the LIKE and regex matches; the last
 * member is the prefix operator.
 */
typedef enum
{
	Pattern_Type_Like = 0,		/* LIKE */
	Pattern_Type_Like_IC = 1,	/* case-insensitive LIKE */
	Pattern_Type_Regex = 2,		/* regular expression */
	Pattern_Type_Regex_IC = 3,	/* case-insensitive regular expression */
	Pattern_Type_Prefix = 4		/* prefix operator */
} Pattern_Type;
/*
 * Result of looking for a fixed prefix in a pattern: no fixed prefix, a
 * partial prefix, or an exact-match-only pattern.
 */
typedef enum
{
	Pattern_Prefix_None,		/* no fixed prefix */
	Pattern_Prefix_Partial,		/* fixed prefix, but not an exact match */
	Pattern_Prefix_Exact		/* pattern can only match exactly */
} Pattern_Prefix_Status;
static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
static List *match_pattern_prefix(Node *leftop,
......@@ -53,6 +75,34 @@ static List *match_pattern_prefix(Node *leftop,
Oid expr_coll,
Oid opfamily,
Oid indexcollation);
/*
 * Prototypes for file-local helpers.
 *
 * patternsel_common is the shared restriction-selectivity estimator used by
 * both the operator path and the planner support function path.
 */
static double patternsel_common(PlannerInfo *root,
Oid oprid,
Oid opfuncid,
List *args,
int varRelid,
Oid collation,
Pattern_Type ptype,
bool negate);
/* Fixed-prefix extraction; rest_selec receives the remainder's selectivity */
static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
Pattern_Type ptype,
Oid collation,
Const **prefix,
Selectivity *rest_selec);
/* Selectivity of the fixed prefix alone, judged against the histogram */
static Selectivity prefix_selectivity(PlannerInfo *root,
VariableStatData *vardata,
Oid vartype, Oid opfamily, Const *prefixcon);
/* Heuristic selectivity of the non-prefix part of a LIKE or regex pattern */
static Selectivity like_selectivity(const char *patt, int pattlen,
bool case_insensitive);
static Selectivity regex_selectivity(const char *patt, int pattlen,
bool case_insensitive,
int fixed_prefix_len);
/* Is this pattern character a letter, hence subject to case-folding? */
static int pattern_char_isalpha(char c, bool is_multibyte,
pg_locale_t locale, bool locale_is_c);
/* Build a string greater than str_const and anything it is a prefix of */
static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
Oid collation);
/* Constant-construction helpers (bodies are not in the visible part of the file) */
static Datum string_to_datum(const char *str, Oid datatype);
static Const *string_to_const(const char *str, Oid datatype);
static Const *string_to_bytea_const(const char *str, size_t str_len);
/*
......@@ -96,7 +146,39 @@ like_regex_support(Node *rawreq, Pattern_Type ptype)
{
Node *ret = NULL;
if (IsA(rawreq, SupportRequestIndexCondition))
if (IsA(rawreq, SupportRequestSelectivity))
{
/*
* Make a selectivity estimate for a function call, just as we'd do if
* the call was via the corresponding operator.
*/
SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq;
Selectivity s1;
if (req->is_join)
{
/*
* For the moment we just punt. If patternjoinsel is ever
* improved to do better, this should be made to call it.
*/
s1 = DEFAULT_MATCH_SEL;
}
else
{
/* Share code with operator restriction selectivity functions */
s1 = patternsel_common(req->root,
InvalidOid,
req->funcid,
req->args,
req->varRelid,
req->inputcollid,
ptype,
false);
}
req->selectivity = s1;
ret = (Node *) req;
}
else if (IsA(rawreq, SupportRequestIndexCondition))
{
/* Try to convert operator/function call to index conditions */
SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq;
......@@ -311,3 +393,1333 @@ match_pattern_prefix(Node *leftop,
return result;
}
/*
 * patternsel_common - generic code for pattern-match restriction selectivity.
 *
 * To support using this from either the operator or function paths, caller
 * may pass either operator OID or underlying function OID; we look up the
 * latter from the former if needed.  (We could just have patternsel() call
 * get_opcode(), but the work would be wasted if we don't have a need to
 * compare a fixed prefix to the pg_statistic data.)
 *
 * Note that oprid and/or opfuncid should be for the positive-match operator
 * even when negate is true.
 *
 * Parameters:
 *	root, args, varRelid - handed to get_restriction_variable() to find the
 *		"variable op constant" shape of the clause
 *	oprid - operator OID, or InvalidOid if only opfuncid is supplied
 *	opfuncid - underlying function OID, or InvalidOid to derive it from oprid
 *	collation - collation passed to pattern_fixed_prefix()
 *	ptype - kind of pattern match being estimated
 *	negate - true to estimate the not-match case
 *
 * Returns the selectivity estimate.  The default estimate is returned when
 * the clause is not "variable op constant" with supported types, and 0.0 is
 * returned for a NULL constant; otherwise the computed value is passed
 * through CLAMP_PROBABILITY before being returned.
 */
static double
patternsel_common(PlannerInfo *root,
Oid oprid,
Oid opfuncid,
List *args,
int varRelid,
Oid collation,
Pattern_Type ptype,
bool negate)
{
VariableStatData vardata;
Node *other;
bool varonleft;
Datum constval;
Oid consttype;
Oid vartype;
Oid opfamily;
Pattern_Prefix_Status pstatus;
Const *patt;
Const *prefix = NULL;
Selectivity rest_selec = 0;
double nullfrac = 0.0;
double result;
/*
 * Initialize result to the appropriate default estimate depending on
 * whether it's a match or not-match operator.
 */
if (negate)
result = 1.0 - DEFAULT_MATCH_SEL;
else
result = DEFAULT_MATCH_SEL;
/*
 * If expression is not variable op constant, then punt and return the
 * default estimate.  (No ReleaseVariableStats is done on this first exit;
 * every later exit releases vardata.)
 */
if (!get_restriction_variable(root, args, varRelid,
&vardata, &other, &varonleft))
return result;
if (!varonleft || !IsA(other, Const))
{
ReleaseVariableStats(vardata);
return result;
}
/*
 * If the constant is NULL, assume operator is strict and return zero, ie,
 * operator will never return TRUE.  (It's zero even for a negator op.)
 */
if (((Const *) other)->constisnull)
{
ReleaseVariableStats(vardata);
return 0.0;
}
constval = ((Const *) other)->constvalue;
consttype = ((Const *) other)->consttype;
/*
 * The right-hand const is type text or bytea for all supported operators.
 * We do not expect to see binary-compatible types here, since
 * const-folding should have relabeled the const to exactly match the
 * operator's declared type.
 */
if (consttype != TEXTOID && consttype != BYTEAOID)
{
ReleaseVariableStats(vardata);
return result;
}
/*
 * Similarly, the exposed type of the left-hand side should be one of
 * those we know.  (Do not look at vardata.atttype, which might be
 * something binary-compatible but different.)  We can use it to choose
 * the index opfamily from which we must draw the comparison operators.
 *
 * NOTE: It would be more correct to use the PATTERN opfamilies than the
 * simple ones, but at the moment ANALYZE will not generate statistics for
 * the PATTERN operators.  But our results are so approximate anyway that
 * it probably hardly matters.
 */
vartype = vardata.vartype;
switch (vartype)
{
case TEXTOID:
case NAMEOID:
opfamily = TEXT_BTREE_FAM_OID;
break;
case BPCHAROID:
opfamily = BPCHAR_BTREE_FAM_OID;
break;
case BYTEAOID:
opfamily = BYTEA_BTREE_FAM_OID;
break;
default:
ReleaseVariableStats(vardata);
return result;
}
/*
 * Grab the nullfrac for use below.  It stays 0.0 when there is no
 * statistics tuple for the variable.
 */
if (HeapTupleIsValid(vardata.statsTuple))
{
Form_pg_statistic stats;
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
nullfrac = stats->stanullfrac;
}
/*
 * Pull out any fixed prefix implied by the pattern, and estimate the
 * fractional selectivity of the remainder of the pattern.  Unlike many
 * other selectivity estimators, we use the pattern operator's actual
 * collation for this step.  This is not because we expect the collation
 * to make a big difference in the selectivity estimate (it seldom would),
 * but because we want to be sure we cache compiled regexps under the
 * right cache key, so that they can be re-used at runtime.
 */
patt = (Const *) other;
pstatus = pattern_fixed_prefix(patt, ptype, collation,
&prefix, &rest_selec);
/*
 * If necessary, coerce the prefix constant to the right type.  The prefix
 * is converted to a C string and a new Const of the variable's type is
 * built from it; the Const obtained above is simply replaced, not freed.
 */
if (prefix && prefix->consttype != vartype)
{
char *prefixstr;
switch (prefix->consttype)
{
case TEXTOID:
prefixstr = TextDatumGetCString(prefix->constvalue);
break;
case BYTEAOID:
prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
prefix->constvalue));
break;
default:
/*
 * NOTE(review): presumably elog(ERROR) does not return, which would make
 * the two statements after it purely defensive -- confirm.
 */
elog(ERROR, "unrecognized consttype: %u",
prefix->consttype);
ReleaseVariableStats(vardata);
return result;
}
prefix = string_to_const(prefixstr, vartype);
pfree(prefixstr);
}
if (pstatus == Pattern_Prefix_Exact)
{
/*
 * Pattern specifies an exact match, so pretend operator is '='
 */
Oid eqopr = get_opfamily_member(opfamily, vartype, vartype,
BTEqualStrategyNumber);
if (eqopr == InvalidOid)
elog(ERROR, "no = operator for opfamily %u", opfamily);
result = var_eq_const(&vardata, eqopr, prefix->constvalue,
false, true, false);
}
else
{
/*
 * Not exact-match pattern.  If we have a sufficiently large
 * histogram, estimate selectivity for the histogram part of the
 * population by counting matches in the histogram.  If not, estimate
 * selectivity of the fixed prefix and remainder of pattern
 * separately, then combine the two to get an estimate of the
 * selectivity for the part of the column population represented by
 * the histogram.  (For small histograms, we combine these
 * approaches.)
 *
 * We then add up data for any most-common-values values; these are
 * not in the histogram population, and we can get exact answers for
 * them by applying the pattern operator, so there's no reason to
 * approximate.  (If the MCVs cover a significant part of the total
 * population, this gives us a big leg up in accuracy.)
 */
Selectivity selec;
int hist_size;
FmgrInfo opproc;
double mcv_selec,
sumcommon;
/*
 * Try to use the histogram entries to get selectivity.  Only callers
 * that supplied an operator OID need the function lookup here.
 */
if (!OidIsValid(opfuncid))
opfuncid = get_opcode(oprid);
fmgr_info(opfuncid, &opproc);
selec = histogram_selectivity(&vardata, &opproc, constval, true,
10, 1, &hist_size);
/* If not at least 100 entries, use the heuristic method */
if (hist_size < 100)
{
Selectivity heursel;
Selectivity prefixsel;
/* With no usable prefix, the prefix imposes no restriction at all */
if (pstatus == Pattern_Prefix_Partial)
prefixsel = prefix_selectivity(root, &vardata, vartype,
opfamily, prefix);
else
prefixsel = 1.0;
heursel = prefixsel * rest_selec;
if (selec < 0) /* fewer than 10 histogram entries? */
selec = heursel;
else
{
/*
 * For histogram sizes from 10 to 100, we combine the
 * histogram and heuristic selectivities, putting increasingly
 * more trust in the histogram for larger sizes.
 */
double hist_weight = hist_size / 100.0;
selec = selec * hist_weight + heursel * (1.0 - hist_weight);
}
}
/* In any case, don't believe extremely small or large estimates. */
if (selec < 0.0001)
selec = 0.0001;
else if (selec > 0.9999)
selec = 0.9999;
/*
 * If we have most-common-values info, add up the fractions of the MCV
 * entries that satisfy MCV OP PATTERN.  These fractions contribute
 * directly to the result selectivity.  Also add up the total fraction
 * represented by MCV entries.
 */
mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
&sumcommon);
/*
 * Now merge the results from the MCV and histogram calculations,
 * realizing that the histogram covers only the non-null values that
 * are not listed in MCV.
 */
selec *= 1.0 - nullfrac - sumcommon;
selec += mcv_selec;
result = selec;
}
/*
 * now adjust if we wanted not-match rather than match; the null fraction
 * is subtracted as well, so nulls count toward neither case
 */
if (negate)
result = 1.0 - result - nullfrac;
/* result should be in range, but make sure... */
CLAMP_PROBABILITY(result);
/* Free the prefix constant's value and the Const node itself */
if (prefix)
{
pfree(DatumGetPointer(prefix->constvalue));
pfree(prefix);
}
ReleaseVariableStats(vardata);
return result;
}
/*
 * patternsel - bridge from the SQL-callable selectivity functions to
 * patternsel_common().
 *
 * Unpacks the four call arguments (planner info, operator OID, argument
 * list, varRelid) plus the call's collation.  patternsel_common() wants the
 * positive-match operator even when estimating a not-match, so for
 * negate = true the operator is replaced by its negator first; it is an
 * error if there is none.
 */
static double
patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
{
	PlannerInfo *planner = (PlannerInfo *) PG_GETARG_POINTER(0);
	Oid			matchop = PG_GETARG_OID(1);
	List	   *clause_args = (List *) PG_GETARG_POINTER(2);
	int			relid = PG_GETARG_INT32(3);
	Oid			coll = PG_GET_COLLATION();

	if (negate)
	{
		/* e.g. NOT LIKE: switch to the corresponding positive operator */
		matchop = get_negator(matchop);
		if (!OidIsValid(matchop))
			elog(ERROR, "patternsel called for operator without a negator");
	}

	/* Function OID is left invalid; patternsel_common looks it up if needed */
	return patternsel_common(planner, matchop, InvalidOid,
							 clause_args, relid, coll,
							 ptype, negate);
}
/*
 * regexeqsel
 *		Restriction selectivity of a regular-expression match.
 */
Datum
regexeqsel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Regex, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icregexeqsel
 *		Restriction selectivity of a case-insensitive regex match.
 */
Datum
icregexeqsel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Regex_IC, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * likesel
 *		Restriction selectivity of a LIKE pattern match.
 */
Datum
likesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Like, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * prefixsel
 *		Restriction selectivity of the prefix operator.
 */
Datum
prefixsel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Prefix, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * iclikesel
 *		Restriction selectivity of an ILIKE pattern match.
 */
Datum
iclikesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Like_IC, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * regexnesel
 *		Restriction selectivity of a regular-expression non-match.
 */
Datum
regexnesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Regex, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icregexnesel
 *		Restriction selectivity of a case-insensitive regex non-match.
 */
Datum
icregexnesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Regex_IC, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * nlikesel
 *		Restriction selectivity of a LIKE pattern non-match.
 */
Datum
nlikesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Like, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icnlikesel
 *		Restriction selectivity of an ILIKE pattern non-match.
 */
Datum
icnlikesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Like_IC, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * patternjoinsel - generic code for pattern-match join selectivity.
 *
 * No real estimation is attempted yet: the result is the default match
 * selectivity, or its complement when a not-match operator is being
 * estimated.  The call arguments and ptype are not examined.
 */
static double
patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
{
	double		selec = DEFAULT_MATCH_SEL;

	/* For the moment we just punt. */
	if (negate)
		selec = 1.0 - DEFAULT_MATCH_SEL;

	return selec;
}
/*
 * regexeqjoinsel
 *		Join selectivity of a regular-expression match.
 */
Datum
regexeqjoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Regex, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icregexeqjoinsel
 *		Join selectivity of a case-insensitive regex match.
 */
Datum
icregexeqjoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * likejoinsel
 *		Join selectivity of a LIKE pattern match.
 */
Datum
likejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Like, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * prefixjoinsel
 *		Join selectivity of the prefix operator.
 */
Datum
prefixjoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Prefix, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * iclikejoinsel
 *		Join selectivity of an ILIKE pattern match.
 */
Datum
iclikejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Like_IC, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * regexnejoinsel
 *		Join selectivity of a regular-expression non-match.
 */
Datum
regexnejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Regex, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icregexnejoinsel
 *		Join selectivity of a case-insensitive regex non-match.
 */
Datum
icregexnejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * nlikejoinsel
 *		Join selectivity of a LIKE pattern non-match.
 */
Datum
nlikejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Like, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icnlikejoinsel
 *		Join selectivity of an ILIKE pattern non-match.
 */
Datum
icnlikejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Like_IC, true);

	PG_RETURN_FLOAT8(selec);
}
/*-------------------------------------------------------------------------
*
* Pattern analysis functions
*
* These routines support analysis of LIKE and regular-expression patterns
* by the planner/optimizer. It's important that they agree with the
* regular-expression code in backend/regex/ and the LIKE code in
* backend/utils/adt/like.c. Also, the computation of the fixed prefix
* must be conservative: if we report a string longer than the true fixed
* prefix, the query may produce actually wrong answers, rather than just
* getting a bad selectivity estimate!
*
*-------------------------------------------------------------------------
*/
/*
 * Extract the fixed prefix, if any, for a pattern.
 *
 * *prefix is set to a palloc'd prefix string (in the form of a Const node),
 * or to NULL if no fixed prefix exists for the pattern.
 * If rest_selec is not NULL, *rest_selec is set to an estimate of the
 * selectivity of the remainder of the pattern (without any fixed prefix).
 * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
 *
 * The return value distinguishes no fixed prefix, a partial prefix,
 * or an exact-match-only pattern.
 *
 * The conventions above are common to like_fixed_prefix, regex_fixed_prefix
 * and their dispatcher pattern_fixed_prefix.
 *
 * like_fixed_prefix handles LIKE patterns.  The prefix ends at the first
 * unescaped '%' or '_', at a trailing lone backslash, or (when matching
 * case-insensitively) at the first character that might be case-varying.
 * Note that this routine always stores a Const in *prefix_const, possibly
 * one holding an empty string.
 */
static Pattern_Prefix_Status
like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
				  Const **prefix_const, Selectivity *rest_selec)
{
	Oid			patttype = patt_const->consttype;
	bool		multibyte_enc = (pg_database_encoding_max_length() > 1);
	pg_locale_t locale = 0;
	bool		c_locale = false;
	bool		is_bytea;
	char	   *pattbuf;
	int			pattlen;
	char	   *fixed;
	int			fixedlen = 0;
	int			scan = 0;

	/* the right-hand const is type text or bytea */
	Assert(patttype == BYTEAOID || patttype == TEXTOID);
	is_bytea = (patttype == BYTEAOID);

	if (case_insensitive)
	{
		if (is_bytea)
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("case insensitive matching not supported on type bytea")));

		/* Case-insensitive matching needs to know the locale in use */
		if (lc_ctype_is_c(collation))
			c_locale = true;
		else if (collation != DEFAULT_COLLATION_OID)
		{
			if (!OidIsValid(collation))
			{
				/*
				 * Usually this means the parser was unable to resolve a
				 * conflict of implicit collations; say so.
				 */
				ereport(ERROR,
						(errcode(ERRCODE_INDETERMINATE_COLLATION),
						 errmsg("could not determine which collation to use for ILIKE"),
						 errhint("Use the COLLATE clause to set the collation explicitly.")));
			}
			locale = pg_newlocale_from_collation(collation);
		}
	}

	/* Get a private, freeable copy of the pattern bytes and their count */
	if (is_bytea)
	{
		bytea	   *raw = DatumGetByteaPP(patt_const->constvalue);

		pattlen = VARSIZE_ANY_EXHDR(raw);
		pattbuf = (char *) palloc(pattlen);
		memcpy(pattbuf, VARDATA_ANY(raw), pattlen);
		Assert((Pointer) raw == DatumGetPointer(patt_const->constvalue));
	}
	else
	{
		pattbuf = TextDatumGetCString(patt_const->constvalue);
		pattlen = strlen(pattbuf);
	}

	/* The prefix can be no longer than the pattern, plus a terminator */
	fixed = palloc(pattlen + 1);

	while (scan < pattlen)
	{
		char		ch = pattbuf[scan];

		/* An unescaped LIKE wildcard ends the prefix */
		if (ch == '%' || ch == '_')
			break;

		/* A backslash makes the following character literal */
		if (ch == '\\')
		{
			scan++;
			if (scan >= pattlen)
				break;
			ch = pattbuf[scan];
		}

		/* A case-varying character behaves like a wildcard, so stop */
		if (case_insensitive &&
			pattern_char_isalpha(ch, multibyte_enc, locale, c_locale))
			break;

		fixed[fixedlen++] = ch;
		scan++;
	}
	fixed[fixedlen] = '\0';

	if (is_bytea)
		*prefix_const = string_to_bytea_const(fixed, fixedlen);
	else
		*prefix_const = string_to_const(fixed, patttype);

	/* Whatever was not consumed is the remainder to be estimated */
	if (rest_selec != NULL)
		*rest_selec = like_selectivity(&pattbuf[scan], pattlen - scan,
									   case_insensitive);

	pfree(pattbuf);
	pfree(fixed);

	/*
	 * Consuming the whole pattern means it can only match exactly; that
	 * includes the empty pattern.
	 */
	if (scan == pattlen)
		return Pattern_Prefix_Exact;

	return (fixedlen > 0) ? Pattern_Prefix_Partial : Pattern_Prefix_None;
}
/*
 * regex_fixed_prefix - fixed-prefix extraction for regular expressions.
 *
 * The prefix itself comes from regexp_fixed_prefix().  *prefix_const is set
 * to a Const of the pattern's type, or to NULL when there is no prefix.
 * When rest_selec is not NULL it receives 1.0 for an exact-match pattern,
 * and otherwise the regex_selectivity() estimate for the pattern, with the
 * prefix length (0 if none) passed along so it can be discounted.
 */
static Pattern_Prefix_Status
regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
				   Const **prefix_const, Selectivity *rest_selec)
{
	Oid			patttype = patt_const->consttype;
	char	   *fixed;
	bool		is_exact;
	Pattern_Prefix_Status status;

	/*
	 * This should never fire, as no bytea regex operators are defined.
	 * Accordingly, nothing below has been made safe for binary strings
	 * (which may contain NULs).
	 */
	if (patttype == BYTEAOID)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("regular-expression matching not supported on type bytea")));

	/* Let the regexp machinery find the prefix, if there is one */
	fixed = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
								case_insensitive, collation,
								&is_exact);

	if (fixed == NULL)
	{
		*prefix_const = NULL;
		status = Pattern_Prefix_None;
	}
	else
	{
		*prefix_const = string_to_const(fixed, patttype);
		status = is_exact ? Pattern_Prefix_Exact : Pattern_Prefix_Partial;
	}

	if (rest_selec != NULL)
	{
		if (fixed != NULL && is_exact)
		{
			/* An exact match leaves nothing further to restrict */
			*rest_selec = 1.0;
		}
		else
		{
			char	   *pattstr = TextDatumGetCString(patt_const->constvalue);

			*rest_selec = regex_selectivity(pattstr, strlen(pattstr),
											case_insensitive,
											(fixed != NULL) ? strlen(fixed) : 0);
			pfree(pattstr);
		}
	}

	if (fixed != NULL)
		pfree(fixed);

	return status;
}
/*
 * pattern_fixed_prefix - extract the fixed prefix of a pattern of any
 * supported type.
 *
 * Dispatches on ptype to like_fixed_prefix() or regex_fixed_prefix(),
 * passing the case-insensitivity flag implied by the type.  For
 * Pattern_Type_Prefix the whole pattern constant is the prefix, so it is
 * simply copied and the status is Pattern_Prefix_Partial.
 *
 * *prefix receives the prefix Const (or NULL if the pattern has none).
 * rest_selec may be NULL if the caller does not want the selectivity of the
 * remainder of the pattern; otherwise *rest_selec is set to that estimate.
 * An unrecognized ptype is reported with elog(ERROR).
 */
static Pattern_Prefix_Status
pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
					 Const **prefix, Selectivity *rest_selec)
{
	Pattern_Prefix_Status result;

	switch (ptype)
	{
		case Pattern_Type_Like:
			result = like_fixed_prefix(patt, false, collation,
									   prefix, rest_selec);
			break;
		case Pattern_Type_Like_IC:
			result = like_fixed_prefix(patt, true, collation,
									   prefix, rest_selec);
			break;
		case Pattern_Type_Regex:
			result = regex_fixed_prefix(patt, false, collation,
										prefix, rest_selec);
			break;
		case Pattern_Type_Regex_IC:
			result = regex_fixed_prefix(patt, true, collation,
										prefix, rest_selec);
			break;
		case Pattern_Type_Prefix:
			/* Prefix type work is trivial. */
			result = Pattern_Prefix_Partial;
			*prefix = makeConst(patt->consttype,
								patt->consttypmod,
								patt->constcollid,
								patt->constlen,
								datumCopy(patt->constvalue,
										  patt->constbyval,
										  patt->constlen),
								patt->constisnull,
								patt->constbyval);

			/*
			 * As in the other cases, rest_selec is optional; don't
			 * dereference it when the caller passed NULL.
			 */
			if (rest_selec != NULL)
				*rest_selec = 1.0;	/* all */
			break;
		default:
			elog(ERROR, "unrecognized ptype: %d", (int) ptype);
			result = Pattern_Prefix_None;	/* keep compiler quiet */
			break;
	}

	return result;
}
/*
 * prefix_selectivity - selectivity of the fixed prefix of a pattern.
 *
 * A fixed prefix "foo" is treated as the range condition
 * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
 *
 * The estimate is relative to the part of the column population that the
 * histogram represents; folding in MCVs and NULLs is the caller's job.
 *
 * The >= and < operators are taken from the given btree opfamily, so the
 * variable and the Const must both be of the opfamily's datatype.
 *
 * XXX Note: the upper bound is used for the estimate even when the locale
 * is one in which the upper-bound string cannot be relied on.  Since the
 * result only has to be approximately right, using the upper-bound code
 * seems more useful than not using it.
 */
static Selectivity
prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
				   Oid vartype, Oid opfamily, Const *prefixcon)
{
	Selectivity sel;
	Selectivity eq_sel;
	Oid			opr;
	FmgrInfo	cmpproc;
	AttStatsSlot sslot;
	Const	   *upper;

	/* Lower bound: fraction of the histogram with var >= prefix */
	opr = get_opfamily_member(opfamily, vartype, vartype,
							  BTGreaterEqualStrategyNumber);
	if (opr == InvalidOid)
		elog(ERROR, "no >= operator for opfamily %u", opfamily);
	fmgr_info(get_opcode(opr), &cmpproc);

	sel = ineq_histogram_selectivity(root, vardata,
									 &cmpproc, true, true,
									 prefixcon->constvalue,
									 prefixcon->consttype);

	/* Without a histogram, all we can offer is a default estimate */
	if (sel < 0.0)
		return DEFAULT_MATCH_SEL;

	/*-------
	 * Upper bound: if a string larger than the prefix can be made, add
	 * "x < greaterstr".  The string is generated under the collation of the
	 * variable's histogram statistics when that can be had, and under
	 * DEFAULT_COLLATION_OID otherwise.
	 *-------
	 */
	if (!HeapTupleIsValid(vardata->statsTuple) ||
		!get_attstatsslot(&sslot, vardata->statsTuple,
						  STATISTIC_KIND_HISTOGRAM, InvalidOid, 0))
		sslot.stacoll = DEFAULT_COLLATION_OID;

	opr = get_opfamily_member(opfamily, vartype, vartype,
							  BTLessStrategyNumber);
	if (opr == InvalidOid)
		elog(ERROR, "no < operator for opfamily %u", opfamily);
	fmgr_info(get_opcode(opr), &cmpproc);

	upper = make_greater_string(prefixcon, &cmpproc, sslot.stacoll);
	if (upper != NULL)
	{
		Selectivity below_upper;

		below_upper = ineq_histogram_selectivity(root, vardata,
												 &cmpproc, false, false,
												 upper->constvalue,
												 upper->consttype);

		/* The histogram was usable a moment ago, so it still should be */
		Assert(below_upper >= 0.0);

		/*
		 * Combine the two bounds the way a range query would be (see
		 * clauselist_selectivity()).  Nulls cannot be excluded twice,
		 * because ineq_histogram_selectivity never counts them.
		 */
		sel = below_upper + sel - 1.0;
	}

	/*
	 * With a long prefix the two bounds can be too close together for the
	 * histogram to separate them, which yields an estimate of zero, give or
	 * take roundoff.  Rather than return something absurdly small, estimate
	 * "variable = 'foo'" and use that as a floor, since the true answer must
	 * be at least that large.
	 *
	 * The floor is applied even when no greater string could be built.  That
	 * suggests a prefix close to the maximum possible value, hence probably
	 * beyond the end of the histogram, hence a very small estimate from the
	 * >= condition alone.
	 */
	opr = get_opfamily_member(opfamily, vartype, vartype,
							  BTEqualStrategyNumber);
	if (opr == InvalidOid)
		elog(ERROR, "no = operator for opfamily %u", opfamily);
	eq_sel = var_eq_const(vardata, opr, prefixcon->constvalue,
						  false, true, false);

	return Max(sel, eq_sel);
}
/*
 * Heuristic selectivity of a pattern of the given type.  Any fixed prefix
 * has already been stripped off, so what arrives here may be only a
 * fragment of the original pattern.
 *
 * The method is deliberately crude: each fixed character cuts the
 * selectivity substantially, each character range cuts it a little, and
 * wildcards (% for LIKE, .* for regex) raise it.
 */
#define FIXED_CHAR_SEL 0.20 /* about 1/5 */
#define CHAR_RANGE_SEL 0.25
#define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
#define FULL_WILDCARD_SEL 5.0
#define PARTIAL_WILDCARD_SEL 2.0

/*
 * like_selectivity - estimate for the first pattlen bytes of a LIKE pattern.
 *
 * case_insensitive is accepted but does not influence the estimate.
 */
static Selectivity
like_selectivity(const char *patt, int pattlen, bool case_insensitive)
{
	Selectivity result = 1.0;
	int			i = 0;

	/* Leading wildcards are already accounted for by the starting value */
	while (i < pattlen && (patt[i] == '%' || patt[i] == '_'))
		i++;

	while (i < pattlen)
	{
		switch (patt[i])
		{
			case '%':
				result *= FULL_WILDCARD_SEL;
				break;
			case '_':
				result *= ANY_CHAR_SEL;
				break;
			case '\\':
				/* The escaped character, if there is one, counts as fixed */
				i++;
				if (i < pattlen)
					result *= FIXED_CHAR_SEL;
				break;
			default:
				result *= FIXED_CHAR_SEL;
				break;
		}
		i++;
	}

	/* Several wildcards can push the product above 1 */
	if (result > 1.0)
		result = 1.0;

	return result;
}
/*
 * regex_selectivity_sub - heuristic selectivity of a regex fragment.
 *
 * Scans the first pattlen bytes of patt.  Characters inside a parenthesized
 * group do not contribute directly; when the outermost group closes, the
 * group's contents are estimated by a recursive call.  A '|' outside any
 * group splits off the remaining alternatives, whose estimate is added in.
 * The result is capped at 1.0.  case_insensitive is only passed along to
 * the recursive calls.
 */
static Selectivity
regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
{
	Selectivity result = 1.0;
	int			depth = 0;
	int			group_start = 0;	/* dummy init to keep compiler quiet */
	bool		rest_consumed = false;
	int			i;

	for (i = 0; i < pattlen && !rest_consumed; i++)
	{
		switch (patt[i])
		{
			case '(':
				/* note where the outermost group begins */
				if (depth == 0)
					group_start = i;
				depth++;
				break;

			case ')':
				if (depth > 0)
				{
					depth--;
					if (depth == 0)
						result *= regex_selectivity_sub(patt + (group_start + 1),
														i - (group_start + 1),
														case_insensitive);
				}
				else if (depth == 0)
				{
					/* unmatched ')' is counted like any other character */
					result *= FIXED_CHAR_SEL;
				}
				break;

			case '|':
				if (depth == 0)
				{
					/*
					 * An unquoted | at the top paren level separates
					 * alternatives, whose probabilities are summed.  The
					 * recursive call handles everything that follows.
					 */
					result += regex_selectivity_sub(patt + (i + 1),
													pattlen - (i + 1),
													case_insensitive);
					rest_consumed = true;
				}
				break;

			case '[':
				{
					bool		negated = false;

					if (patt[++i] == '^')
					{
						negated = true;
						i++;
					}
					/* a ']' that opens the class is an ordinary member */
					if (patt[i] == ']')
						i++;
					while (i < pattlen && patt[i] != ']')
						i++;
					if (depth == 0)
						result *= (negated ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
				}
				break;

			case '.':
				if (depth == 0)
					result *= ANY_CHAR_SEL;
				break;

			case '*':
			case '?':
			case '+':
				/* Quantifiers deserve smarter treatment than this... */
				if (depth == 0)
					result *= PARTIAL_WILDCARD_SEL;
				break;

			case '{':
				/* skip over the whole bound specification */
				while (i < pattlen && patt[i] != '}')
					i++;
				if (depth == 0)
					result *= PARTIAL_WILDCARD_SEL;
				break;

			case '\\':
				/* the quoted character, if present, counts as fixed */
				i++;
				if (i < pattlen && depth == 0)
					result *= FIXED_CHAR_SEL;
				break;

			default:
				if (depth == 0)
					result *= FIXED_CHAR_SEL;
				break;
		}
	}

	/* Several wildcards can push the product above 1 */
	if (result > 1.0)
		result = 1.0;

	return result;
}
/*
 * regex_selectivity - heuristic selectivity of a regular expression.
 *
 * patt/pattlen give the complete pattern.  fixed_prefix_len is the length
 * of the fixed prefix that the caller estimates separately (0 if none); its
 * contribution is divided back out so that it is not counted twice.
 *
 * The result is forced into the range 0..1 by CLAMP_PROBABILITY.
 */
static Selectivity
regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
				  int fixed_prefix_len)
{
	Selectivity sel;

	/* If patt doesn't end with $, consider it to have a trailing wildcard */
	if (pattlen > 0 && patt[pattlen - 1] == '$' &&
		(pattlen == 1 || patt[pattlen - 2] != '\\'))
	{
		/* has trailing $ */
		sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
	}
	else
	{
		/* no trailing $ */
		sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
		sel *= FULL_WILDCARD_SEL;
	}

	/*
	 * If there's a fixed prefix, discount its selectivity.  Be careful: for
	 * a very long prefix pow() can underflow to zero (and "sel" has then
	 * most likely underflowed too).  Dividing by zero would produce Inf or
	 * NaN, and NaN is not repaired by CLAMP_PROBABILITY, so skip the
	 * division in that case.
	 */
	if (fixed_prefix_len > 0)
	{
		double		prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len);

		if (prefixsel > 0.0)
			sel /= prefixsel;
	}

	/* Make sure result stays in range */
	CLAMP_PROBABILITY(sel);
	return sel;
}
/*
 * Check whether char is a letter (and, hence, subject to case-folding)
 *
 * In multibyte character sets or with ICU, we can't use isalpha, and it does
 * not seem worth trying to convert to wchar_t to use iswalpha.  Instead, just
 * assume any multibyte (high-bit-set) char is potentially case-varying.
 *
 * With ICU, plain ASCII letters must be reported as letters too.  If they
 * were not, a case-insensitive pattern's ASCII letters would be taken as
 * part of its fixed prefix, and the fixed prefix must never be reported
 * longer than it really is.
 *
 * Returns nonzero if c may be case-varying, zero otherwise.
 */
static int
pattern_char_isalpha(char c, bool is_multibyte,
					 pg_locale_t locale, bool locale_is_c)
{
	if (locale_is_c)
		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
	else if (is_multibyte && IS_HIGHBIT_SET(c))
		return true;
	else if (locale && locale->provider == COLLPROVIDER_ICU)
		return IS_HIGHBIT_SET(c) ||
			(c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
#ifdef HAVE_LOCALE_T
	else if (locale && locale->provider == COLLPROVIDER_LIBC)
		return isalpha_l((unsigned char) c, locale->info.lt);
#endif
	else
		return isalpha((unsigned char) c);
}
/*
 * byte_increment - character incrementer for bytea.
 *
 * bytea has no multibyte characters to worry about, so incrementing a
 * "character" just means adding one to the byte at *ptr.  Returns false and
 * leaves the byte alone if it already holds the largest value (255);
 * otherwise increments it in place and returns true.  len is not used.
 */
static bool
byte_increment(unsigned char *ptr, int len)
{
	bool		has_room = (*ptr < 255);

	if (has_room)
		*ptr += 1;

	return has_room;
}
/*
* Try to generate a string greater than the given string or any
* string it is a prefix of. If successful, return a palloc'd string
* in the form of a Const node; else return NULL.
*
* The caller must provide the appropriate "less than" comparison function
* for testing the strings, along with the collation to use.
*
* The key requirement here is that given a prefix string, say "foo",
* we must be able to generate another string "fop" that is greater than
* all strings "foobar" starting with "foo". We can test that we have
* generated a string greater than the prefix string, but in non-C collations
* that is not a bulletproof guarantee that an extension of the string might
* not sort after it; an example is that "foo " is less than "foo!", but it
* is not clear that a "dictionary" sort ordering will consider "foo!" less
* than "foo bar". CAUTION: Therefore, this function should be used only for
* estimation purposes when working in a non-C collation.
*
* To try to catch most cases where an extended string might otherwise sort
* before the result value, we determine which of the strings "Z", "z", "y",
* and "9" is seen as largest by the collation, and append that to the given
* prefix before trying to find a string that compares as larger.
*
* To search for a greater string, we repeatedly "increment" the rightmost
* character, using an encoding-specific character incrementer function.
* When it's no longer possible to increment the last character, we truncate
* off that character and start incrementing the next-to-rightmost.
* For example, if "z" were the last character in the sort order, then we
* could produce "foo" as a string greater than "fonz".
*
* This could be rather slow in the worst case, but in most cases we
* won't have to try more than one or two strings before succeeding.
*
* Note that it's important for the character incrementer not to be too anal
* about producing every possible character code, since in some cases the only
* way to get a larger string is to increment a previous character position.
* So we don't want to spend too much time trying every possible character
* code at the last position. A good rule of thumb is to be sure that we
* don't try more than 256*K values for a K-byte character (and definitely
* not 256^K, which is what an exhaustive search would approach).
*/
static Const *
make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
{
/*
 * Inputs/outputs as used below:
 *   str_const - the prefix constant; its consttype selects the BYTEA, NAME
 *               or text-like handling.
 *   ltproc    - "less than" comparison function; it is called as
 *               ltproc(cmpstr, candidate) under the given collation.
 *   collation - collation used both for the comparison and for choosing
 *               the suffix character.
 * Returns a newly built Const holding the first candidate for which ltproc
 * reports cmpstr < candidate, or NULL if every candidate was rejected.
 * The working buffers (workstr, cmptxt) are freed on both exit paths.
 */
Oid datatype = str_const->consttype;
char *workstr;
int len;
Datum cmpstr;
/* cmptxt is non-NULL only when a suffixed comparison string was built */
char *cmptxt = NULL;
mbcharacter_incrementer charinc;
/*
* Get a modifiable copy of the prefix string in C-string format, and set
* up the string we will compare to as a Datum. In C locale this can just
* be the given prefix string, otherwise we need to add a suffix. Type
* BYTEA sorts bytewise so it never needs a suffix either.
*/
if (datatype == BYTEAOID)
{
bytea *bstr = DatumGetByteaPP(str_const->constvalue);
len = VARSIZE_ANY_EXHDR(bstr);
/* raw byte copy; note that no terminating NUL is stored here */
workstr = (char *) palloc(len);
memcpy(workstr, VARDATA_ANY(bstr), len);
Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
cmpstr = str_const->constvalue;
}
else
{
if (datatype == NAMEOID)
workstr = DatumGetCString(DirectFunctionCall1(nameout,
str_const->constvalue));
else
workstr = TextDatumGetCString(str_const->constvalue);
len = strlen(workstr);
/* C collation or an empty prefix: compare against the prefix itself */
if (lc_collate_is_c(collation) || len == 0)
cmpstr = str_const->constvalue;
else
{
/* If first time through, determine the suffix to use */
/*
 * The chosen suffix is kept in function-static variables and is
 * recomputed only when no suffix has been chosen yet or the
 * collation differs from the one it was chosen for.
 */
static char suffixchar = 0;
static Oid suffixcollation = 0;
if (!suffixchar || suffixcollation != collation)
{
char *best;
/* keep whichever of "Z", "z", "y", "9" compares largest */
best = "Z";
if (varstr_cmp(best, 1, "z", 1, collation) < 0)
best = "z";
if (varstr_cmp(best, 1, "y", 1, collation) < 0)
best = "y";
if (varstr_cmp(best, 1, "9", 1, collation) < 0)
best = "9";
suffixchar = *best;
suffixcollation = collation;
}
/* And build the string to compare to */
if (datatype == NAMEOID)
{
/* NAME case: NUL-terminated buffer of prefix + suffix character */
cmptxt = palloc(len + 2);
memcpy(cmptxt, workstr, len);
cmptxt[len] = suffixchar;
cmptxt[len + 1] = '\0';
cmpstr = PointerGetDatum(cmptxt);
}
else
{
/* text-like case: varlena header followed by prefix + suffix character */
cmptxt = palloc(VARHDRSZ + len + 1);
SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
memcpy(VARDATA(cmptxt), workstr, len);
*(VARDATA(cmptxt) + len) = suffixchar;
cmpstr = PointerGetDatum(cmptxt);
}
}
}
/* Select appropriate character-incrementer function */
if (datatype == BYTEAOID)
charinc = byte_increment;
else
charinc = pg_database_encoding_character_incrementer();
/* And search ... */
/* Outer loop: one iteration per trailing character that gets dropped */
while (len > 0)
{
int charlen;
unsigned char *lastchar;
/* Identify the last character --- for bytea, just the last byte */
if (datatype == BYTEAOID)
charlen = 1;
else
charlen = len - pg_mbcliplen(workstr, len, len - 1);
lastchar = (unsigned char *) (workstr + len - charlen);
/*
* Try to generate a larger string by incrementing the last character
* (for BYTEA, we treat each byte as a character).
*
* Note: the incrementer function is expected to return true if it's
* generated a valid-per-the-encoding new character, otherwise false.
* The contents of the character on false return are unspecified.
*/
while (charinc(lastchar, charlen))
{
Const *workstr_const;
/* wrap the current candidate in a Const so ltproc can examine it */
if (datatype == BYTEAOID)
workstr_const = string_to_bytea_const(workstr, len);
else
workstr_const = string_to_const(workstr, datatype);
if (DatumGetBool(FunctionCall2Coll(ltproc,
collation,
cmpstr,
workstr_const->constvalue)))
{
/* Successfully made a string larger than cmpstr */
if (cmptxt)
pfree(cmptxt);
pfree(workstr);
return workstr_const;
}
/* No good, release unusable value and try again */
pfree(DatumGetPointer(workstr_const->constvalue));
pfree(workstr_const);
}
/*
* No luck here, so truncate off the last character and try to
* increment the next one.
*/
len -= charlen;
workstr[len] = '\0';
}
/* Failed... */
if (cmptxt)
pfree(cmptxt);
pfree(workstr);
return NULL;
}
/*
 * string_to_datum
 *		Build a Datum of type "datatype" from the C string "str".
 *
 * NAME and BYTEA go through their input functions (namein / byteain);
 * every other type is handled by CStringGetTextDatum(), which we assume is
 * also good enough for bpchar and varchar constants.
 *
 * All supported types are pass-by-reference, so the caller should pfree
 * the returned value once it is no longer needed.
 */
static Datum
string_to_datum(const char *str, Oid datatype)
{
	Datum		result;

	Assert(str != NULL);

	switch (datatype)
	{
		case NAMEOID:
			result = DirectFunctionCall1(namein, CStringGetDatum(str));
			break;

		case BYTEAOID:
			result = DirectFunctionCall1(byteain, CStringGetDatum(str));
			break;

		default:
			/* the small cheat: treat anything else as text */
			result = CStringGetTextDatum(str);
			break;
	}

	return result;
}
/*
 * string_to_const
 *		Wrap the C string "str" in a non-null Const node of type "datatype".
 *
 * Only a handful of types are supported, so the collation and type length
 * are hard-wired here rather than fetched from the catalogs.  Any other
 * type raises an error.
 */
static Const *
string_to_const(const char *str, Oid datatype)
{
	Datum		value = string_to_datum(str, datatype);
	Oid			collid;
	int			typlen;

	if (datatype == TEXTOID ||
		datatype == VARCHAROID ||
		datatype == BPCHAROID)
	{
		collid = DEFAULT_COLLATION_OID;
		typlen = -1;
	}
	else if (datatype == NAMEOID)
	{
		collid = C_COLLATION_OID;
		typlen = NAMEDATALEN;
	}
	else if (datatype == BYTEAOID)
	{
		collid = InvalidOid;
		typlen = -1;
	}
	else
	{
		elog(ERROR, "unexpected datatype in string_to_const: %u",
			 datatype);
		return NULL;
	}

	return makeConst(datatype, -1, collid, typlen,
					 value, false, false);
}
/*
 * string_to_bytea_const
 *		Wrap "str_len" raw bytes starting at "str" in a non-null bytea Const.
 *
 * The bytes are copied into a freshly palloc'd varlena, so the input need
 * not be NUL-terminated and may contain embedded zero bytes.
 */
static Const *
string_to_bytea_const(const char *str, size_t str_len)
{
	size_t		total_len = VARHDRSZ + str_len;
	bytea	   *copy = palloc(total_len);

	SET_VARSIZE(copy, total_len);
	memcpy(VARDATA(copy), str, str_len);

	return makeConst(BYTEAOID, -1, InvalidOid, -1,
					 PointerGetDatum(copy), false, false);
}
......@@ -110,12 +110,9 @@
#include "catalog/pg_am.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_opfamily.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_type.h"
#include "executor/executor.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
......@@ -125,14 +122,10 @@
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/plancat.h"
#include "optimizer/restrictinfo.h"
#include "parser/parse_clause.h"
#include "parser/parse_coerce.h"
#include "parser/parsetree.h"
#include "statistics/statistics.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/bytea.h"
#include "utils/date.h"
#include "utils/datum.h"
#include "utils/fmgroids.h"
......@@ -146,7 +139,6 @@
#include "utils/syscache.h"
#include "utils/timestamp.h"
#include "utils/typcache.h"
#include "utils/varlena.h"
/* Hooks for plugins to get control when we ask for stats */
......@@ -154,16 +146,6 @@ get_relation_stats_hook_type get_relation_stats_hook = NULL;
get_index_stats_hook_type get_index_stats_hook = NULL;
static double eqsel_internal(PG_FUNCTION_ARGS, bool negate);
static double var_eq_const(VariableStatData *vardata, Oid operator,
Datum constval, bool constisnull,
bool varonleft, bool negate);
static double var_eq_non_const(VariableStatData *vardata, Oid operator,
Node *other,
bool varonleft, bool negate);
static double ineq_histogram_selectivity(PlannerInfo *root,
VariableStatData *vardata,
FmgrInfo *opproc, bool isgt, bool iseq,
Datum constval, Oid consttype);
static double eqjoinsel_inner(Oid opfuncoid,
VariableStatData *vardata1, VariableStatData *vardata2,
double nd1, double nd2,
......@@ -215,17 +197,6 @@ static bool get_actual_variable_range(PlannerInfo *root,
Oid sortop,
Datum *min, Datum *max);
static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids);
static Selectivity prefix_selectivity(PlannerInfo *root,
VariableStatData *vardata,
Oid vartype, Oid opfamily, Const *prefixcon);
static Selectivity like_selectivity(const char *patt, int pattlen,
bool case_insensitive);
static Selectivity regex_selectivity(const char *patt, int pattlen,
bool case_insensitive,
int fixed_prefix_len);
static Datum string_to_datum(const char *str, Oid datatype);
static Const *string_to_const(const char *str, Oid datatype);
static Const *string_to_bytea_const(const char *str, size_t str_len);
static IndexQualInfo *deconstruct_indexqual(RestrictInfo *rinfo,
IndexOptInfo *index, int indexcol);
static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals);
......@@ -304,9 +275,9 @@ eqsel_internal(PG_FUNCTION_ARGS, bool negate)
/*
* var_eq_const --- eqsel for var = const case
*
* This is split out so that some other estimation functions can use it.
* This is exported so that some other estimation functions can use it.
*/
static double
double
var_eq_const(VariableStatData *vardata, Oid operator,
Datum constval, bool constisnull,
bool varonleft, bool negate)
......@@ -457,8 +428,10 @@ var_eq_const(VariableStatData *vardata, Oid operator,
/*
* var_eq_non_const --- eqsel for var = something-other-than-const case
*
* This is exported so that some other estimation functions can use it.
*/
static double
double
var_eq_non_const(VariableStatData *vardata, Oid operator,
Node *other,
bool varonleft, bool negate)
......@@ -784,8 +757,10 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
* Note that the result disregards both the most-common-values (if any) and
* null entries. The caller is expected to combine this result with
* statistics for those portions of the column population.
*
* This is exported so that some other estimation functions can use it.
*/
static double
double
ineq_histogram_selectivity(PlannerInfo *root,
VariableStatData *vardata,
FmgrInfo *opproc, bool isgt, bool iseq,
......@@ -1198,361 +1173,6 @@ scalargesel(PG_FUNCTION_ARGS)
return scalarineqsel_wrapper(fcinfo, true, true);
}
/*
* patternsel - Generic code for pattern-match selectivity.
*
* The fmgr arguments are read as: (0) the PlannerInfo, (1) the operator OID,
* (2) the operator's argument list, (3) varRelid; the collation comes from
* PG_GET_COLLATION().  "ptype" selects which kind of pattern is analyzed,
* and "negate" requests the estimate for the non-match form of the operator.
*
* Returns a selectivity estimate.  A default of DEFAULT_MATCH_SEL (or one
* minus that when negating) is returned whenever the clause is not of the
* form "variable op constant" with supported types; 0.0 is returned for a
* NULL constant.  Otherwise the computed value is clamped with
* CLAMP_PROBABILITY before being returned.
*/
static double
patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
{
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
Oid operator = PG_GETARG_OID(1);
List *args = (List *) PG_GETARG_POINTER(2);
int varRelid = PG_GETARG_INT32(3);
Oid collation = PG_GET_COLLATION();
VariableStatData vardata;
Node *other;
bool varonleft;
Datum constval;
Oid consttype;
Oid vartype;
Oid opfamily;
Pattern_Prefix_Status pstatus;
Const *patt;
Const *prefix = NULL;
Selectivity rest_selec = 0;
double nullfrac = 0.0;
double result;
/*
* If this is for a NOT LIKE or similar operator, get the corresponding
* positive-match operator and work with that. Set result to the correct
* default estimate, too.
*/
if (negate)
{
operator = get_negator(operator);
if (!OidIsValid(operator))
elog(ERROR, "patternsel called for operator without a negator");
result = 1.0 - DEFAULT_MATCH_SEL;
}
else
{
result = DEFAULT_MATCH_SEL;
}
/*
* If expression is not variable op constant, then punt and return a
* default estimate.
*/
if (!get_restriction_variable(root, args, varRelid,
&vardata, &other, &varonleft))
return result;
/* from here on, every exit path must call ReleaseVariableStats(vardata) */
if (!varonleft || !IsA(other, Const))
{
ReleaseVariableStats(vardata);
return result;
}
/*
* If the constant is NULL, assume operator is strict and return zero, ie,
* operator will never return TRUE. (It's zero even for a negator op.)
*/
if (((Const *) other)->constisnull)
{
ReleaseVariableStats(vardata);
return 0.0;
}
constval = ((Const *) other)->constvalue;
consttype = ((Const *) other)->consttype;
/*
* The right-hand const is type text or bytea for all supported operators.
* We do not expect to see binary-compatible types here, since
* const-folding should have relabeled the const to exactly match the
* operator's declared type.
*/
if (consttype != TEXTOID && consttype != BYTEAOID)
{
ReleaseVariableStats(vardata);
return result;
}
/*
* Similarly, the exposed type of the left-hand side should be one of
* those we know. (Do not look at vardata.atttype, which might be
* something binary-compatible but different.) We can use it to choose
* the index opfamily from which we must draw the comparison operators.
*
* NOTE: It would be more correct to use the PATTERN opfamilies than the
* simple ones, but at the moment ANALYZE will not generate statistics for
* the PATTERN operators. But our results are so approximate anyway that
* it probably hardly matters.
*/
vartype = vardata.vartype;
switch (vartype)
{
case TEXTOID:
case NAMEOID:
opfamily = TEXT_BTREE_FAM_OID;
break;
case BPCHAROID:
opfamily = BPCHAR_BTREE_FAM_OID;
break;
case BYTEAOID:
opfamily = BYTEA_BTREE_FAM_OID;
break;
default:
ReleaseVariableStats(vardata);
return result;
}
/*
* Grab the nullfrac for use below.
*/
if (HeapTupleIsValid(vardata.statsTuple))
{
Form_pg_statistic stats;
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
nullfrac = stats->stanullfrac;
}
/*
* Pull out any fixed prefix implied by the pattern, and estimate the
* fractional selectivity of the remainder of the pattern. Unlike many of
* the other functions in this file, we use the pattern operator's actual
* collation for this step. This is not because we expect the collation
* to make a big difference in the selectivity estimate (it seldom would),
* but because we want to be sure we cache compiled regexps under the
* right cache key, so that they can be re-used at runtime.
*/
patt = (Const *) other;
pstatus = pattern_fixed_prefix(patt, ptype, collation,
&prefix, &rest_selec);
/*
* If necessary, coerce the prefix constant to the right type.
*/
if (prefix && prefix->consttype != vartype)
{
char *prefixstr;
/* go through a C string to rebuild the prefix as a vartype constant */
switch (prefix->consttype)
{
case TEXTOID:
prefixstr = TextDatumGetCString(prefix->constvalue);
break;
case BYTEAOID:
prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
prefix->constvalue));
break;
default:
elog(ERROR, "unrecognized consttype: %u",
prefix->consttype);
ReleaseVariableStats(vardata);
return result;
}
prefix = string_to_const(prefixstr, vartype);
pfree(prefixstr);
}
if (pstatus == Pattern_Prefix_Exact)
{
/*
* Pattern specifies an exact match, so pretend operator is '='
*/
Oid eqopr = get_opfamily_member(opfamily, vartype, vartype,
BTEqualStrategyNumber);
if (eqopr == InvalidOid)
elog(ERROR, "no = operator for opfamily %u", opfamily);
result = var_eq_const(&vardata, eqopr, prefix->constvalue,
false, true, false);
}
else
{
/*
* Not exact-match pattern. If we have a sufficiently large
* histogram, estimate selectivity for the histogram part of the
* population by counting matches in the histogram. If not, estimate
* selectivity of the fixed prefix and remainder of pattern
* separately, then combine the two to get an estimate of the
* selectivity for the part of the column population represented by
* the histogram. (For small histograms, we combine these
* approaches.)
*
* We then add up data for any most-common-values values; these are
* not in the histogram population, and we can get exact answers for
* them by applying the pattern operator, so there's no reason to
* approximate. (If the MCVs cover a significant part of the total
* population, this gives us a big leg up in accuracy.)
*/
Selectivity selec;
int hist_size;
FmgrInfo opproc;
double mcv_selec,
sumcommon;
/* Try to use the histogram entries to get selectivity */
fmgr_info(get_opcode(operator), &opproc);
selec = histogram_selectivity(&vardata, &opproc, constval, true,
10, 1, &hist_size);
/* If not at least 100 entries, use the heuristic method */
if (hist_size < 100)
{
Selectivity heursel;
Selectivity prefixsel;
/* without a partial prefix, only rest_selec constrains the estimate */
if (pstatus == Pattern_Prefix_Partial)
prefixsel = prefix_selectivity(root, &vardata, vartype,
opfamily, prefix);
else
prefixsel = 1.0;
heursel = prefixsel * rest_selec;
if (selec < 0) /* fewer than 10 histogram entries? */
selec = heursel;
else
{
/*
* For histogram sizes from 10 to 100, we combine the
* histogram and heuristic selectivities, putting increasingly
* more trust in the histogram for larger sizes.
*/
double hist_weight = hist_size / 100.0;
selec = selec * hist_weight + heursel * (1.0 - hist_weight);
}
}
/* In any case, don't believe extremely small or large estimates. */
if (selec < 0.0001)
selec = 0.0001;
else if (selec > 0.9999)
selec = 0.9999;
/*
* If we have most-common-values info, add up the fractions of the MCV
* entries that satisfy MCV OP PATTERN. These fractions contribute
* directly to the result selectivity. Also add up the total fraction
* represented by MCV entries.
*/
mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
&sumcommon);
/*
* Now merge the results from the MCV and histogram calculations,
* realizing that the histogram covers only the non-null values that
* are not listed in MCV.
*/
selec *= 1.0 - nullfrac - sumcommon;
selec += mcv_selec;
result = selec;
}
/* now adjust if we wanted not-match rather than match */
if (negate)
result = 1.0 - result - nullfrac;
/* result should be in range, but make sure... */
CLAMP_PROBABILITY(result);
/* release the prefix constant (the value first, then the node itself) */
if (prefix)
{
pfree(DatumGetPointer(prefix->constvalue));
pfree(prefix);
}
ReleaseVariableStats(vardata);
return result;
}
/*
 * regexeqsel
 *		Selectivity of a regular-expression pattern match; a thin fmgr
 *		wrapper around patternsel().
 */
Datum
regexeqsel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Regex, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icregexeqsel
 *		Selectivity of a case-insensitive regular-expression match; a thin
 *		fmgr wrapper around patternsel().
 */
Datum
icregexeqsel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Regex_IC, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * likesel
 *		Selectivity of a LIKE pattern match; a thin fmgr wrapper around
 *		patternsel().
 */
Datum
likesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Like, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * prefixsel
 *		Selectivity of the prefix operator; a thin fmgr wrapper around
 *		patternsel().
 */
Datum
prefixsel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Prefix, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * iclikesel
 *		Selectivity of an ILIKE pattern match; a thin fmgr wrapper around
 *		patternsel().
 */
Datum
iclikesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Like_IC, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * regexnesel
 *		Selectivity of a regular-expression pattern non-match; calls
 *		patternsel() with negation requested.
 */
Datum
regexnesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Regex, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icregexnesel
 *		Selectivity of a case-insensitive regular-expression non-match;
 *		calls patternsel() with negation requested.
 */
Datum
icregexnesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Regex_IC, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * nlikesel
 *		Selectivity of a LIKE pattern non-match; calls patternsel() with
 *		negation requested.
 */
Datum
nlikesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Like, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icnlikesel
 *		Selectivity of an ILIKE pattern non-match; calls patternsel() with
 *		negation requested.
 */
Datum
icnlikesel(PG_FUNCTION_ARGS)
{
	double		selec = patternsel(fcinfo, Pattern_Type_Like_IC, true);

	PG_RETURN_FLOAT8(selec);
}
/*
* boolvarsel - Selectivity of Boolean variable.
*
......@@ -2896,123 +2516,33 @@ scalargejoinsel(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
}
/*
 * patternjoinsel
 *		Common code for pattern-match join selectivity.
 *
 * No real estimation is attempted yet: the result is DEFAULT_MATCH_SEL for
 * a match operator and its complement for a non-match operator.  The fmgr
 * arguments and "ptype" are not examined.
 */
static double
patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
{
	/* For the moment we just punt. */
	if (negate)
		return 1.0 - DEFAULT_MATCH_SEL;

	return DEFAULT_MATCH_SEL;
}
/*
* regexeqjoinsel - Join selectivity of regular-expression pattern match.
* mergejoinscansel - Scan selectivity of merge join.
*
* A merge join will stop as soon as it exhausts either input stream.
* Therefore, if we can estimate the ranges of both input variables,
* we can estimate how much of the input will actually be read. This
* can have a considerable impact on the cost when using indexscans.
*
* Also, we can estimate how much of each input has to be read before the
* first join pair is found, which will affect the join's startup time.
*
* clause should be a clause already known to be mergejoinable. opfamily,
* strategy, and nulls_first specify the sort ordering being used.
*
* The outputs are:
* *leftstart is set to the fraction of the left-hand variable expected
* to be scanned before the first join pair is found (0 to 1).
* *leftend is set to the fraction of the left-hand variable expected
* to be scanned before the join terminates (0 to 1).
* *rightstart, *rightend similarly for the right-hand variable.
*/
/* Join selectivity of regular-expression pattern match (via patternjoinsel) */
Datum
regexeqjoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Regex, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icregexeqjoinsel
 *		Join selectivity of a case-insensitive regular-expression match; a
 *		thin fmgr wrapper around patternjoinsel().
 */
Datum
icregexeqjoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * likejoinsel
 *		Join selectivity of a LIKE pattern match; a thin fmgr wrapper around
 *		patternjoinsel().
 */
Datum
likejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Like, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * prefixjoinsel
 *		Join selectivity of the prefix operator; a thin fmgr wrapper around
 *		patternjoinsel().
 */
Datum
prefixjoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Prefix, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * iclikejoinsel
 *		Join selectivity of an ILIKE pattern match; a thin fmgr wrapper
 *		around patternjoinsel().
 */
Datum
iclikejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Like_IC, false);

	PG_RETURN_FLOAT8(selec);
}
/*
 * regexnejoinsel
 *		Join selectivity of a regular-expression non-match; calls
 *		patternjoinsel() with negation requested.
 */
Datum
regexnejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Regex, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icregexnejoinsel
 *		Join selectivity of a case-insensitive regular-expression non-match;
 *		calls patternjoinsel() with negation requested.
 */
Datum
icregexnejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * nlikejoinsel
 *		Join selectivity of a LIKE pattern non-match; calls patternjoinsel()
 *		with negation requested.
 */
Datum
nlikejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Like, true);

	PG_RETURN_FLOAT8(selec);
}
/*
 * icnlikejoinsel
 *		Join selectivity of an ILIKE pattern non-match; calls
 *		patternjoinsel() with negation requested.
 */
Datum
icnlikejoinsel(PG_FUNCTION_ARGS)
{
	double		selec = patternjoinsel(fcinfo, Pattern_Type_Like_IC, true);

	PG_RETURN_FLOAT8(selec);
}
/*
* mergejoinscansel - Scan selectivity of merge join.
*
* A merge join will stop as soon as it exhausts either input stream.
* Therefore, if we can estimate the ranges of both input variables,
* we can estimate how much of the input will actually be read. This
* can have a considerable impact on the cost when using indexscans.
*
* Also, we can estimate how much of each input has to be read before the
* first join pair is found, which will affect the join's startup time.
*
* clause should be a clause already known to be mergejoinable. opfamily,
* strategy, and nulls_first specify the sort ordering being used.
*
* The outputs are:
* *leftstart is set to the fraction of the left-hand variable expected
* to be scanned before the first join pair is found (0 to 1).
* *leftend is set to the fraction of the left-hand variable expected
* to be scanned before the join terminates (0 to 1).
* *rightstart, *rightend similarly for the right-hand variable.
*/
void
mergejoinscansel(PlannerInfo *root, Node *clause,
Oid opfamily, int strategy, bool nulls_first,
Selectivity *leftstart, Selectivity *leftend,
Selectivity *rightstart, Selectivity *rightend)
void
mergejoinscansel(PlannerInfo *root, Node *clause,
Oid opfamily, int strategy, bool nulls_first,
Selectivity *leftstart, Selectivity *leftend,
Selectivity *rightstart, Selectivity *rightend)
{
Node *left,
*right;
......@@ -5716,853 +5246,6 @@ find_join_input_rel(PlannerInfo *root, Relids relids)
}
/*-------------------------------------------------------------------------
*
* Pattern analysis functions
*
* These routines support analysis of LIKE and regular-expression patterns
* by the planner/optimizer. It's important that they agree with the
* regular-expression code in backend/regex/ and the LIKE code in
* backend/utils/adt/like.c. Also, the computation of the fixed prefix
* must be conservative: if we report a string longer than the true fixed
* prefix, the query may produce actually wrong answers, rather than just
* getting a bad selectivity estimate!
*
* Note that the prefix-analysis functions are called from
* backend/optimizer/path/indxpath.c as well as from routines in this file.
*
*-------------------------------------------------------------------------
*/
/*
 * Check whether char is a letter (and, hence, subject to case-folding)
 *
 * In multibyte character sets or with ICU, we can't use isalpha, and it does
 * not seem worth trying to convert to wchar_t to use iswalpha.  Instead, just
 * assume any non-ASCII char is potentially case-varying, and hard-wire
 * knowledge of which ASCII chars are letters.
 *
 * c			 the byte to classify
 * is_multibyte	 true if the database encoding is multibyte
 * locale		 locale to consult, or 0 to fall back on plain isalpha()
 * locale_is_c	 true if C-locale rules apply (ASCII letters only)
 *
 * Returns nonzero if the character must be treated as case-varying.
 *
 * The ICU branch must report ASCII letters as letters too.  Were it to
 * report only high-bit bytes, an ILIKE pattern's fixed prefix could swallow
 * case-varying ASCII letters, which would make the prefix too long.
 */
static int
pattern_char_isalpha(char c, bool is_multibyte,
					 pg_locale_t locale, bool locale_is_c)
{
	if (locale_is_c)
		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
	else if (is_multibyte && IS_HIGHBIT_SET(c))
		return true;
	else if (locale && locale->provider == COLLPROVIDER_ICU)
		return IS_HIGHBIT_SET(c) ||
			(c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
#ifdef HAVE_LOCALE_T
	else if (locale && locale->provider == COLLPROVIDER_LIBC)
		return isalpha_l((unsigned char) c, locale->info.lt);
#endif
	else
		return isalpha((unsigned char) c);
}
/*
* Extract the fixed prefix, if any, for a pattern.
*
* *prefix is set to a palloc'd prefix string (in the form of a Const node),
* or to NULL if no fixed prefix exists for the pattern.
* If rest_selec is not NULL, *rest_selec is set to an estimate of the
* selectivity of the remainder of the pattern (without any fixed prefix).
* The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
*
* The return value distinguishes no fixed prefix, a partial prefix,
* or an exact-match-only pattern.
*/
/*
 * like_fixed_prefix
 *		Fixed-prefix extraction for LIKE and ILIKE patterns.
 *
 * The pattern is scanned up to its first wildcard ('%' or '_'); a backslash
 * makes the character after it literal.  For case-insensitive matching the
 * scan also stops at the first character that pattern_char_isalpha()
 * reports as a letter.
 *
 * *prefix_const always receives a new Const of the pattern's own type,
 * holding the scanned prefix (which may be empty).  If rest_selec is not
 * NULL, *rest_selec receives like_selectivity()'s estimate for the part of
 * the pattern starting at the stop point.
 */
static Pattern_Prefix_Status
like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
				  Const **prefix_const, Selectivity *rest_selec)
{
	Oid			pattype = patt_const->consttype;
	bool		multibyte_enc = (pg_database_encoding_max_length() > 1);
	bool		c_locale = false;
	pg_locale_t locale = 0;
	char	   *pattbuf;
	char	   *prefixbuf;
	int			pattlen;
	int			ipat;
	int			nprefix;

	/* the right-hand const is type text or bytea */
	Assert(pattype == BYTEAOID || pattype == TEXTOID);

	if (case_insensitive)
	{
		if (pattype == BYTEAOID)
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("case insensitive matching not supported on type bytea")));

		/* If case-insensitive, we need locale info */
		if (lc_ctype_is_c(collation))
			c_locale = true;
		else if (collation != DEFAULT_COLLATION_OID)
		{
			if (!OidIsValid(collation))
			{
				/*
				 * This typically means that the parser could not resolve a
				 * conflict of implicit collations, so report it that way.
				 */
				ereport(ERROR,
						(errcode(ERRCODE_INDETERMINATE_COLLATION),
						 errmsg("could not determine which collation to use for ILIKE"),
						 errhint("Use the COLLATE clause to set the collation explicitly.")));
			}
			locale = pg_newlocale_from_collation(collation);
		}
	}

	/* Make a private copy of the pattern's bytes */
	if (pattype == BYTEAOID)
	{
		bytea	   *raw = DatumGetByteaPP(patt_const->constvalue);

		pattlen = VARSIZE_ANY_EXHDR(raw);
		pattbuf = (char *) palloc(pattlen);
		memcpy(pattbuf, VARDATA_ANY(raw), pattlen);
		Assert((Pointer) raw == DatumGetPointer(patt_const->constvalue));
	}
	else
	{
		pattbuf = TextDatumGetCString(patt_const->constvalue);
		pattlen = strlen(pattbuf);
	}

	/* Copy literal characters until something stops the scan */
	prefixbuf = palloc(pattlen + 1);
	nprefix = 0;
	ipat = 0;
	while (ipat < pattlen)
	{
		/* % and _ are wildcard characters in LIKE */
		if (pattbuf[ipat] == '%' || pattbuf[ipat] == '_')
			break;

		/* Backslash escapes the next character */
		if (pattbuf[ipat] == '\\')
		{
			ipat++;
			if (ipat >= pattlen)
				break;
		}

		/* Stop if case-varying character (it's sort of a wildcard) */
		if (case_insensitive &&
			pattern_char_isalpha(pattbuf[ipat], multibyte_enc,
								 locale, c_locale))
			break;

		prefixbuf[nprefix++] = pattbuf[ipat];
		ipat++;
	}
	prefixbuf[nprefix] = '\0';

	/* bytea prefixes are built by length since they may contain zero bytes */
	if (pattype == BYTEAOID)
		*prefix_const = string_to_bytea_const(prefixbuf, nprefix);
	else
		*prefix_const = string_to_const(prefixbuf, pattype);

	if (rest_selec != NULL)
		*rest_selec = like_selectivity(&pattbuf[ipat], pattlen - ipat,
									   case_insensitive);

	pfree(pattbuf);
	pfree(prefixbuf);

	/* in LIKE, an empty pattern is an exact match! */
	if (ipat == pattlen)
		return Pattern_Prefix_Exact;	/* reached end of pattern, so exact */

	return (nprefix > 0) ? Pattern_Prefix_Partial : Pattern_Prefix_None;
}
/*
 * regex_fixed_prefix
 *		Fixed-prefix extraction for regular-expression patterns.
 *
 * The regexp machinery (regexp_fixed_prefix) determines the prefix.  If it
 * finds none, *prefix_const is set to NULL and Pattern_Prefix_None is
 * returned; otherwise *prefix_const receives a new Const of the pattern's
 * type and the result is Exact or Partial as the machinery reports.
 *
 * If rest_selec is not NULL, *rest_selec is set to 1.0 for an exact match
 * and to regex_selectivity()'s estimate otherwise.
 */
static Pattern_Prefix_Status
regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
				   Const **prefix_const, Selectivity *rest_selec)
{
	Oid			pattype = patt_const->consttype;
	char	   *fixed;
	bool		is_exact;

	/*
	 * Should be unnecessary, there are no bytea regex operators defined. As
	 * such, it should be noted that the rest of this function has *not* been
	 * made safe for binary (possibly NULL containing) strings.
	 */
	if (pattype == BYTEAOID)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("regular-expression matching not supported on type bytea")));

	/* Use the regexp machinery to extract the prefix, if any */
	fixed = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
								case_insensitive, collation,
								&is_exact);

	/* No prefix at all: estimate over the whole pattern and leave */
	if (fixed == NULL)
	{
		*prefix_const = NULL;

		if (rest_selec != NULL)
		{
			char	   *pattstr = TextDatumGetCString(patt_const->constvalue);

			*rest_selec = regex_selectivity(pattstr, strlen(pattstr),
											case_insensitive,
											0);
			pfree(pattstr);
		}

		return Pattern_Prefix_None;
	}

	*prefix_const = string_to_const(fixed, pattype);

	if (rest_selec != NULL)
	{
		if (!is_exact)
		{
			char	   *pattstr = TextDatumGetCString(patt_const->constvalue);

			*rest_selec = regex_selectivity(pattstr, strlen(pattstr),
											case_insensitive,
											strlen(fixed));
			pfree(pattstr);
		}
		else
		{
			/* Exact match, so there's no additional selectivity */
			*rest_selec = 1.0;
		}
	}

	pfree(fixed);

	/* Exact means the pattern specifies an exact match */
	return is_exact ? Pattern_Prefix_Exact : Pattern_Prefix_Partial;
}
/*
 * pattern_fixed_prefix
 *		Extract the fixed prefix of a pattern, dispatching on pattern type.
 *
 * patt			the pattern constant
 * ptype		which kind of pattern "patt" is
 * collation	collation passed through to the LIKE/regex analyzers
 * prefix		output: receives the prefix Const chosen by the analyzer
 * rest_selec	output, may be NULL: if not NULL, receives the estimated
 *				selectivity of the pattern beyond the fixed prefix
 *
 * LIKE and regex patterns are handed to like_fixed_prefix() and
 * regex_fixed_prefix().  For the prefix operator the whole pattern is the
 * prefix, so a copy of "patt" is returned with status
 * Pattern_Prefix_Partial.  An unrecognized ptype raises an error.
 */
Pattern_Prefix_Status
pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
					 Const **prefix, Selectivity *rest_selec)
{
	Pattern_Prefix_Status result;

	switch (ptype)
	{
		case Pattern_Type_Like:
			result = like_fixed_prefix(patt, false, collation,
									   prefix, rest_selec);
			break;
		case Pattern_Type_Like_IC:
			result = like_fixed_prefix(patt, true, collation,
									   prefix, rest_selec);
			break;
		case Pattern_Type_Regex:
			result = regex_fixed_prefix(patt, false, collation,
										prefix, rest_selec);
			break;
		case Pattern_Type_Regex_IC:
			result = regex_fixed_prefix(patt, true, collation,
										prefix, rest_selec);
			break;
		case Pattern_Type_Prefix:
			/* Prefix type work is trivial. */
			result = Pattern_Prefix_Partial;

			/*
			 * rest_selec is optional for the other pattern types, so it
			 * must not be dereferenced unconditionally here either.
			 */
			if (rest_selec != NULL)
				*rest_selec = 1.0;	/* all */

			*prefix = makeConst(patt->consttype,
								patt->consttypmod,
								patt->constcollid,
								patt->constlen,
								datumCopy(patt->constvalue,
										  patt->constbyval,
										  patt->constlen),
								patt->constisnull,
								patt->constbyval);
			break;
		default:
			elog(ERROR, "unrecognized ptype: %d", (int) ptype);
			result = Pattern_Prefix_None;	/* keep compiler quiet */
			break;
	}

	return result;
}
/*
 * prefix_selectivity
 *		Estimate the selectivity of a pattern's fixed prefix.
 *
 * A fixed prefix "foo" is treated like the range condition
 * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
 *
 * The estimate covers only the part of the column population represented
 * by the histogram; the caller must fold in MCV and NULL information.
 *
 * The >=, < and = operators are taken from the given btree opfamily, so the
 * variable and the Const must both be of the datatype that goes with it.
 *
 * XXX Note: the upper bound is used for the estimate even when the locale
 * is such that the upper-bound string cannot be relied on.  The result only
 * needs to be approximately right anyway, so using the upper-bound code
 * seems more useful than not using it.
 */
static Selectivity
prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
				   Oid vartype, Oid opfamily, Const *prefixcon)
{
	Selectivity result;
	Selectivity eq_est;
	Oid			opr;
	Oid			stats_coll;
	FmgrInfo	cmpfn;
	AttStatsSlot slot;
	Const	   *upper;

	/* Lower bound: histogram fraction satisfying var >= prefix */
	opr = get_opfamily_member(opfamily, vartype, vartype,
							  BTGreaterEqualStrategyNumber);
	if (!OidIsValid(opr))
		elog(ERROR, "no >= operator for opfamily %u", opfamily);
	fmgr_info(get_opcode(opr), &cmpfn);

	result = ineq_histogram_selectivity(root, vardata,
										&cmpfn, true, true,
										prefixcon->constvalue,
										prefixcon->consttype);

	/* No histogram is present ... return a suitable default estimate */
	if (result < 0.0)
		return DEFAULT_MATCH_SEL;

	/*-------
	 * If we can create a string larger than the prefix, say
	 * "x < greaterstr".  We try to generate the string referencing the
	 * collation of the var's statistics, but if that's not available,
	 * use DEFAULT_COLLATION_OID.
	 *-------
	 */
	stats_coll = DEFAULT_COLLATION_OID;
	if (HeapTupleIsValid(vardata->statsTuple) &&
		get_attstatsslot(&slot, vardata->statsTuple,
						 STATISTIC_KIND_HISTOGRAM, InvalidOid, 0))
		stats_coll = slot.stacoll;

	opr = get_opfamily_member(opfamily, vartype, vartype,
							  BTLessStrategyNumber);
	if (!OidIsValid(opr))
		elog(ERROR, "no < operator for opfamily %u", opfamily);
	fmgr_info(get_opcode(opr), &cmpfn);

	upper = make_greater_string(prefixcon, &cmpfn, stats_coll);
	if (upper != NULL)
	{
		Selectivity below_upper;

		below_upper = ineq_histogram_selectivity(root, vardata,
												 &cmpfn, false, false,
												 upper->constvalue,
												 upper->consttype);

		/* ineq_histogram_selectivity worked before, it shouldn't fail now */
		Assert(below_upper >= 0.0);

		/*
		 * Merge the two selectivities in the same way as for a range query
		 * (see clauselist_selectivity()).  Double-exclusion of nulls is not
		 * a concern, since ineq_histogram_selectivity doesn't count those
		 * anyway.
		 */
		result = below_upper + result - 1.0;
	}

	/*
	 * With a long prefix the two bounds might be too close together for the
	 * histogram to tell them apart, giving a zero estimate (plus or minus
	 * roundoff error).  To avoid returning a ridiculously small estimate,
	 * compute the selectivity of "variable = 'foo'" and use it as a floor;
	 * the real answer should be at least that.
	 *
	 * This is applied even if no greater string could be made.  That case
	 * suggests the prefix is near the maximum possible, hence probably off
	 * the end of the histogram, so the >= estimate was probably very small
	 * and still needs the floor.
	 */
	opr = get_opfamily_member(opfamily, vartype, vartype,
							  BTEqualStrategyNumber);
	if (!OidIsValid(opr))
		elog(ERROR, "no = operator for opfamily %u", opfamily);

	eq_est = var_eq_const(vardata, opr, prefixcon->constvalue,
						  false, true, false);

	result = Max(result, eq_est);

	return result;
}
/*
* Estimate the selectivity of a pattern of the specified type.
* Note that any fixed prefix of the pattern will have been removed already,
* so actually we may be looking at just a fragment of the pattern.
*
* For now, we use a very simplistic approach: fixed characters reduce the
* selectivity a good deal, character ranges reduce it a little,
* wildcards (such as % for LIKE or .* for regex) increase it.
*/
#define FIXED_CHAR_SEL 0.20 /* about 1/5 */
#define CHAR_RANGE_SEL 0.25
#define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
#define FULL_WILDCARD_SEL 5.0
#define PARTIAL_WILDCARD_SEL 2.0
/*
 * like_selectivity
 *		Heuristic selectivity of a LIKE pattern (or the remainder of one).
 *
 * patt/pattlen describe the pattern bytes; the string need not be
 * NUL-terminated.  case_insensitive is accepted for interface symmetry but
 * does not currently influence the estimate.
 *
 * Every element of the pattern contributes one multiplicative factor; the
 * product is capped at 1.0 before being returned.
 */
static Selectivity
like_selectivity(const char *patt, int pattlen, bool case_insensitive)
{
	Selectivity result = 1.0;
	int			i = 0;

	/*
	 * A run of wildcards at the very start is already reflected in the
	 * initial estimate, so step over it without applying any factor.
	 */
	while (i < pattlen && (patt[i] == '%' || patt[i] == '_'))
		i++;

	while (i < pattlen)
	{
		switch (patt[i])
		{
			case '%':
				/* matches any sequence of characters */
				result *= FULL_WILDCARD_SEL;
				break;

			case '_':
				/* matches exactly one character */
				result *= ANY_CHAR_SEL;
				break;

			case '\\':
				/*
				 * Escape: the following character is a literal.  A
				 * backslash that ends the pattern contributes nothing.
				 */
				i++;
				if (i < pattlen)
					result *= FIXED_CHAR_SEL;
				break;

			default:
				/* ordinary literal character */
				result *= FIXED_CHAR_SEL;
				break;
		}
		i++;
	}

	/* Several wildcards can push the product above 1; cap it */
	return (result > 1.0) ? 1.0 : result;
}
/*
 * regex_selectivity_sub
 *		Heuristic selectivity of the regex fragment patt[0 .. pattlen-1].
 *
 * The fragment is scanned left to right and each top-level element
 * multiplies the running estimate by one of the *_SEL constants.  A
 * parenthesized group is not scored character by character at this level;
 * instead, when its closing ')' brings the depth back to zero, the group's
 * interior is scored by a recursive call and that result is multiplied in.
 * An unparenthesized '|' ends the scan: the estimate for everything after it
 * is computed recursively and added.
 *
 * Bracket expressions, "{...}" bounds and backslash escapes are stepped over
 * at every paren depth, so that a '(' , ')' or '|' appearing inside them is
 * never mistaken for grouping or alternation syntax.
 *
 * case_insensitive is only passed through to the recursive calls.
 *
 * Only bytes inside the given span are examined; this function does not rely
 * on patt being NUL-terminated.  The returned value is capped at 1.0.
 *
 * NOTE(review): recursion depth follows the nesting depth of the pattern and
 * nothing here limits it; consider whether a stack-depth check is wanted.
 */
static Selectivity
regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
{
	Selectivity sel = 1.0;
	int			paren_depth = 0;
	int			paren_pos = 0;	/* dummy init to keep compiler quiet */
	int			pos;

	for (pos = 0; pos < pattlen; pos++)
	{
		if (patt[pos] == '(')
		{
			if (paren_depth == 0)
				paren_pos = pos;	/* remember start of parenthesized item */
			paren_depth++;
		}
		else if (patt[pos] == ')' && paren_depth > 0)
		{
			paren_depth--;
			/* Outermost group just closed: score its interior as a unit */
			if (paren_depth == 0)
				sel *= regex_selectivity_sub(patt + (paren_pos + 1),
											 pos - (paren_pos + 1),
											 case_insensitive);
		}
		else if (patt[pos] == '|' && paren_depth == 0)
		{
			/*
			 * If unquoted | is present at paren level 0 in pattern, we have
			 * multiple alternatives; sum their probabilities.
			 */
			sel += regex_selectivity_sub(patt + (pos + 1),
										 pattlen - (pos + 1),
										 case_insensitive);
			break;				/* rest of pattern is now processed */
		}
		else if (patt[pos] == '[')
		{
			bool		negclass = false;

			/*
			 * Step past the '['.  Every look at patt[pos] from here on is
			 * guarded by pos < pattlen, so a '[' or "[^" that ends the span
			 * cannot cause a read beyond it.
			 */
			pos++;
			if (pos < pattlen && patt[pos] == '^')
			{
				negclass = true;
				pos++;
			}
			/* ']' at start of class is not special */
			if (pos < pattlen && patt[pos] == ']')
				pos++;
			/* Advance to the closing ']' (or to the end of the span) */
			while (pos < pattlen && patt[pos] != ']')
				pos++;
			if (paren_depth == 0)
				sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
		}
		else if (patt[pos] == '.')
		{
			if (paren_depth == 0)
				sel *= ANY_CHAR_SEL;
		}
		else if (patt[pos] == '*' ||
				 patt[pos] == '?' ||
				 patt[pos] == '+')
		{
			/* Ought to be smarter about quantifiers... */
			if (paren_depth == 0)
				sel *= PARTIAL_WILDCARD_SEL;
		}
		else if (patt[pos] == '{')
		{
			/* Skip the whole "{...}" bound; it counts as one quantifier */
			while (pos < pattlen && patt[pos] != '}')
				pos++;
			if (paren_depth == 0)
				sel *= PARTIAL_WILDCARD_SEL;
		}
		else if (patt[pos] == '\\')
		{
			/* backslash quotes the next character */
			pos++;
			if (pos >= pattlen)
				break;
			if (paren_depth == 0)
				sel *= FIXED_CHAR_SEL;
		}
		else
		{
			/* ordinary literal character */
			if (paren_depth == 0)
				sel *= FIXED_CHAR_SEL;
		}
	}

	/* Could get sel > 1 if multiple wildcards */
	if (sel > 1.0)
		sel = 1.0;
	return sel;
}
/*
 * regex_selectivity
 *		Heuristic selectivity of a whole regular expression.
 *
 * patt/pattlen give the pattern text.  fixed_prefix_len is the length of a
 * fixed prefix of the pattern (0 if there is none); the factors that
 * regex_selectivity_sub() charged for that many literal characters are
 * divided back out of the estimate.
 *
 * The result is forced into the valid probability range before returning.
 */
static Selectivity
regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
				  int fixed_prefix_len)
{
	Selectivity result;
	bool		anchored_at_end;

	/*
	 * The pattern counts as end-anchored when its last character is '$' and
	 * that '$' is not immediately preceded by a backslash.
	 */
	anchored_at_end = (pattlen > 0 &&
					   patt[pattlen - 1] == '$' &&
					   (pattlen == 1 || patt[pattlen - 2] != '\\'));

	if (anchored_at_end)
	{
		/* Score everything except the trailing '$' */
		result = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
	}
	else
	{
		/* Unanchored: behave as though a full wildcard followed the pattern */
		result = regex_selectivity_sub(patt, pattlen, case_insensitive);
		result *= FULL_WILDCARD_SEL;
	}

	/* Remove the contribution of the fixed prefix, if there is one */
	if (fixed_prefix_len > 0)
		result /= pow(FIXED_CHAR_SEL, fixed_prefix_len);

	/* Keep the estimate within the legal range */
	CLAMP_PROBABILITY(result);

	return result;
}
/*
 * Character incrementer used for bytea values.
 *
 * bytea has no multibyte characters, so "incrementing a character" is just
 * adding one to the byte at *ptr.  Returns true after bumping the byte, or
 * false (leaving the byte untouched) when it already holds 255.  The len
 * argument is part of the incrementer signature and is not consulted here.
 */
static bool
byte_increment(unsigned char *ptr, int len)
{
	bool		can_advance = (*ptr < 255);

	if (can_advance)
		*ptr += 1;

	return can_advance;
}
/*
 * Try to generate a string greater than the given string or any
 * string it is a prefix of. If successful, return a palloc'd string
 * in the form of a Const node; else return NULL.
 *
 * The caller must provide the appropriate "less than" comparison function
 * for testing the strings, along with the collation to use.
 *
 * The key requirement here is that given a prefix string, say "foo",
 * we must be able to generate another string "fop" that is greater than
 * all strings "foobar" starting with "foo". We can test that we have
 * generated a string greater than the prefix string, but in non-C collations
 * that is not a bulletproof guarantee that an extension of the string might
 * not sort after it; an example is that "foo " is less than "foo!", but it
 * is not clear that a "dictionary" sort ordering will consider "foo!" less
 * than "foo bar". CAUTION: Therefore, this function should be used only for
 * estimation purposes when working in a non-C collation.
 *
 * To try to catch most cases where an extended string might otherwise sort
 * before the result value, we determine which of the strings "Z", "z", "y",
 * and "9" is seen as largest by the collation, and append that to the given
 * prefix before trying to find a string that compares as larger.
 *
 * To search for a greater string, we repeatedly "increment" the rightmost
 * character, using an encoding-specific character incrementer function.
 * When it's no longer possible to increment the last character, we truncate
 * off that character and start incrementing the next-to-rightmost.
 * For example, if "z" were the last character in the sort order, then we
 * could produce "foo" as a string greater than "fonz".
 *
 * This could be rather slow in the worst case, but in most cases we
 * won't have to try more than one or two strings before succeeding.
 *
 * Note that it's important for the character incrementer not to insist
 * on producing every possible character code, since in some cases the only
 * way to get a larger string is to increment a previous character position.
 * So we don't want to spend too much time trying every possible character
 * code at the last position. A good rule of thumb is to be sure that we
 * don't try more than 256*K values for a K-byte character (and definitely
 * not 256^K, which is what an exhaustive search would approach).
 *
 * Parameters:
 *	str_const - the prefix constant. Its consttype selects the handling:
 *		BYTEAOID is treated as raw bytes, NAMEOID is converted with nameout,
 *		and anything else is converted with TextDatumGetCString.
 *	ltproc - the "less than" function; it is called as
 *		ltproc(comparison string, candidate) under the given collation.
 *	collation - collation used for the comparisons and for choosing the
 *		suffix character.
 *
 * Returns the first candidate Const for which ltproc reports true, or NULL
 * once every character of the prefix has been truncated away without
 * success. The work buffer and any suffixed comparison string are pfree'd
 * on both exits.
 */
Const *
make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
{
Oid datatype = str_const->consttype;
/* modifiable copy of the prefix; incremented and truncated in place */
char *workstr;
/* number of bytes of workstr currently in use */
int len;
/* value every candidate must compare greater than */
Datum cmpstr;
/* non-NULL only if a suffixed comparison string was allocated here */
char *cmptxt = NULL;
mbcharacter_incrementer charinc;
/*
 * Get a modifiable copy of the prefix string in C-string format, and set
 * up the string we will compare to as a Datum. In C locale this can just
 * be the given prefix string, otherwise we need to add a suffix. Type
 * BYTEA sorts bytewise so it never needs a suffix either.
 */
if (datatype == BYTEAOID)
{
bytea *bstr = DatumGetByteaPP(str_const->constvalue);
len = VARSIZE_ANY_EXHDR(bstr);
/* raw byte copy: exactly len bytes, no NUL terminator is added */
workstr = (char *) palloc(len);
memcpy(workstr, VARDATA_ANY(bstr), len);
/* check that bstr is the original datum pointer, not a separate copy */
Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
cmpstr = str_const->constvalue;
}
else
{
if (datatype == NAMEOID)
workstr = DatumGetCString(DirectFunctionCall1(nameout,
str_const->constvalue));
else
workstr = TextDatumGetCString(str_const->constvalue);
len = strlen(workstr);
/* C collation or an empty prefix: compare against the prefix itself */
if (lc_collate_is_c(collation) || len == 0)
cmpstr = str_const->constvalue;
else
{
/* If first time through, determine the suffix to use */
/*
 * These are function-static, so the chosen suffix is remembered across
 * calls and recomputed only when a different collation is passed in.
 */
static char suffixchar = 0;
static Oid suffixcollation = 0;
if (!suffixchar || suffixcollation != collation)
{
char *best;
/* keep whichever of "Z", "z", "y", "9" the collation sorts highest */
best = "Z";
if (varstr_cmp(best, 1, "z", 1, collation) < 0)
best = "z";
if (varstr_cmp(best, 1, "y", 1, collation) < 0)
best = "y";
if (varstr_cmp(best, 1, "9", 1, collation) < 0)
best = "9";
suffixchar = *best;
suffixcollation = collation;
}
/* And build the string to compare to */
if (datatype == NAMEOID)
{
/* name case: NUL-terminated C string = prefix + suffix char */
cmptxt = palloc(len + 2);
memcpy(cmptxt, workstr, len);
cmptxt[len] = suffixchar;
cmptxt[len + 1] = '\0';
cmpstr = PointerGetDatum(cmptxt);
}
else
{
/* text case: varlena header + prefix + suffix char, no terminator */
cmptxt = palloc(VARHDRSZ + len + 1);
SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
memcpy(VARDATA(cmptxt), workstr, len);
*(VARDATA(cmptxt) + len) = suffixchar;
cmpstr = PointerGetDatum(cmptxt);
}
}
}
/* Select appropriate character-incrementer function */
if (datatype == BYTEAOID)
charinc = byte_increment;
else
charinc = pg_database_encoding_character_incrementer();
/* And search ... */
/* Each outer iteration works on the current last character of workstr */
while (len > 0)
{
int charlen;
unsigned char *lastchar;
/* Identify the last character --- for bytea, just the last byte */
if (datatype == BYTEAOID)
charlen = 1;
else
charlen = len - pg_mbcliplen(workstr, len, len - 1);
lastchar = (unsigned char *) (workstr + len - charlen);
/*
 * Try to generate a larger string by incrementing the last character
 * (for BYTEA, we treat each byte as a character).
 *
 * Note: the incrementer function is expected to return true if it's
 * generated a valid-per-the-encoding new character, otherwise false.
 * The contents of the character on false return are unspecified.
 */
while (charinc(lastchar, charlen))
{
Const *workstr_const;
/* wrap the current candidate in a Const so ltproc can be applied to it */
if (datatype == BYTEAOID)
workstr_const = string_to_bytea_const(workstr, len);
else
workstr_const = string_to_const(workstr, datatype);
if (DatumGetBool(FunctionCall2Coll(ltproc,
collation,
cmpstr,
workstr_const->constvalue)))
{
/* Successfully made a string larger than cmpstr */
if (cmptxt)
pfree(cmptxt);
pfree(workstr);
return workstr_const;
}
/* No good, release unusable value and try again */
pfree(DatumGetPointer(workstr_const->constvalue));
pfree(workstr_const);
}
/*
 * No luck here, so truncate off the last character and try to
 * increment the next one.
 */
len -= charlen;
/*
 * Re-terminate so the C-string path sees the shortened string. len has
 * just been reduced, so this index lies inside the buffer in the bytea
 * case as well.
 */
workstr[len] = '\0';
}
/* Failed... */
if (cmptxt)
pfree(cmptxt);
pfree(workstr);
return NULL;
}
/*
 * string_to_datum
 *		Convert a C string into a Datum of the requested type.
 *
 * NAMEOID goes through namein and BYTEAOID through byteain; every other
 * type is given a text datum, on the assumption that the text
 * representation also serves for bpchar and varchar constants.
 *
 * All supported types are pass-by-reference, so the caller should pfree
 * the result once it is no longer needed.
 */
static Datum
string_to_datum(const char *str, Oid datatype)
{
	Datum		result;

	Assert(str != NULL);

	switch (datatype)
	{
		case NAMEOID:
			result = DirectFunctionCall1(namein, CStringGetDatum(str));
			break;

		case BYTEAOID:
			result = DirectFunctionCall1(byteain, CStringGetDatum(str));
			break;

		default:
			/* text, and by assumption bpchar/varchar as well */
			result = CStringGetTextDatum(str);
			break;
	}

	return result;
}
/*
 * string_to_const
 *		Build a non-null Const of the given type from a C string.
 *
 * Only text, varchar, bpchar, name and bytea are supported.  Their collation
 * and type length are hard-wired here to avoid catalog lookups; any other
 * type OID is reported with elog(ERROR).
 */
static Const *
string_to_const(const char *str, Oid datatype)
{
	/* The datum is built first, before the type OID is validated below */
	Datum		value = string_to_datum(str, datatype);
	Oid			collid;
	int			typlen;

	if (datatype == TEXTOID ||
		datatype == VARCHAROID ||
		datatype == BPCHAROID)
	{
		/* variable-length string types using the default collation */
		collid = DEFAULT_COLLATION_OID;
		typlen = -1;
	}
	else if (datatype == NAMEOID)
	{
		/* fixed-length name using the C collation */
		collid = C_COLLATION_OID;
		typlen = NAMEDATALEN;
	}
	else if (datatype == BYTEAOID)
	{
		/* variable-length and not collatable */
		collid = InvalidOid;
		typlen = -1;
	}
	else
	{
		elog(ERROR, "unexpected datatype in string_to_const: %u",
			 datatype);
		return NULL;
	}

	return makeConst(datatype, -1, collid, typlen,
					 value, false, false);
}
/*
 * string_to_bytea_const
 *		Build a non-null bytea Const from a byte buffer and its length.
 *
 * str need not be NUL-terminated: exactly str_len bytes are copied into a
 * freshly palloc'd varlena, which becomes the Const's value.
 */
static Const *
string_to_bytea_const(const char *str, size_t str_len)
{
	size_t		total_size = VARHDRSZ + str_len;
	bytea	   *result = palloc(total_size);

	/* fill in the varlena length word, then the payload */
	SET_VARSIZE(result, total_size);
	memcpy(VARDATA(result), str, str_len);

	return makeConst(BYTEAOID, -1, InvalidOid, -1,
					 PointerGetDatum(result), false, false);
}
/*-------------------------------------------------------------------------
*
* Index cost estimation functions
......
......@@ -15,7 +15,6 @@
#ifndef SELFUNCS_H
#define SELFUNCS_H
#include "fmgr.h"
#include "access/htup.h"
#include "nodes/pathnodes.h"
......@@ -85,20 +84,6 @@ typedef struct VariableStatData
} while(0)
/*
 * Kinds of pattern-matching operation; pattern_fixed_prefix() takes one of
 * these as its ptype argument.
 *
 * NOTE(review): the _IC suffix presumably marks the case-insensitive variant
 * of LIKE / regex matching, and Pattern_Type_Prefix presumably denotes a
 * plain "starts with" test -- confirm against the code that switches on
 * this enum.
 */
typedef enum
{
Pattern_Type_Like,
Pattern_Type_Like_IC,
Pattern_Type_Regex,
Pattern_Type_Regex_IC,
Pattern_Type_Prefix
} Pattern_Type;
/*
 * Result code of pattern_fixed_prefix().
 *
 * NOTE(review): presumably None = the pattern has no usable fixed prefix,
 * Partial = a fixed prefix followed by further pattern, Exact = the whole
 * pattern is a fixed string -- confirm against pattern_fixed_prefix().
 */
typedef enum
{
Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact
} Pattern_Prefix_Status;
/*
* deconstruct_indexquals is a simple function to examine the indexquals
* attached to a proposed IndexPath. It returns a list of IndexQualInfo
......@@ -175,14 +160,16 @@ extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
Datum constval, bool varonleft,
int min_hist_size, int n_skip,
int *hist_size);
extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
Pattern_Type ptype,
Oid collation,
Const **prefix,
Selectivity *rest_selec);
extern Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
Oid collation);
extern double ineq_histogram_selectivity(PlannerInfo *root,
VariableStatData *vardata,
FmgrInfo *opproc, bool isgt, bool iseq,
Datum constval, Oid consttype);
extern double var_eq_const(VariableStatData *vardata, Oid oproid,
Datum constval, bool constisnull,
bool varonleft, bool negate);
extern double var_eq_non_const(VariableStatData *vardata, Oid oproid,
Node *other,
bool varonleft, bool negate);
extern Selectivity boolvarsel(PlannerInfo *root, Node *arg, int varRelid);
extern Selectivity booltestsel(PlannerInfo *root, BoolTestType booltesttype,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment