Commit c66e4f13 authored by Tom Lane

Improve GiST range-contained-by searches by adding a flag for empty ranges.

In the original implementation, a range-contained-by search had to scan
the entire index because an empty range could be lurking anywhere.
Improve that by adding a flag to upper GiST entries that says whether the
represented subtree contains any empty ranges.

Also, make a simple mod to the penalty function to discourage empty ranges
from getting pushed into subtrees without any.  This needs more work, and
the picksplit function should be taught about it too, but that code can be
improved without causing an on-disk compatibility break; so we'll leave it
for another day.

Since we're breaking on-disk compatibility of range values anyway, I took
the opportunity to reorganize the range flags bits; the unused
RANGE_xB_NULL bits are now adjacent, which might open the door for using
them in some other way later.

In passing, remove the GiST range opclass entry for <>, which doesn't seem
like it can really be indexed usefully.

Alexander Korotkov, with some editorializing by Tom
parent 08da2d28
...@@ -1622,6 +1622,24 @@ range_get_flags(RangeType *range) ...@@ -1622,6 +1622,24 @@ range_get_flags(RangeType *range)
return *((char *) range + VARSIZE(range) - 1); return *((char *) range + VARSIZE(range) - 1);
} }
/*
 * range_set_contain_empty: turn on the RANGE_CONTAIN_EMPTY bit of a range.
 *
 * The flags live in the final byte of the datum, so we locate that byte via
 * the datum's size and OR the bit into it.  The value is modified in place.
 *
 * Only GiST operations need this bit, so range_serialize makes no provision
 * for setting it; callers must apply this function to the value afterwards.
 */
void
range_set_contain_empty(RangeType *range)
{
	/* address of the flags byte, i.e. the last byte of the datum */
	char	   *last_byte = (char *) range + VARSIZE(range) - 1;

	*last_byte |= RANGE_CONTAIN_EMPTY;
}
/* /*
* This both serializes and canonicalizes (if applicable) the range. * This both serializes and canonicalizes (if applicable) the range.
* This should be used by most callers. * This should be used by most callers.
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "access/gist.h" #include "access/gist.h"
#include "access/skey.h" #include "access/skey.h"
#include "utils/builtins.h" #include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/rangetypes.h" #include "utils/rangetypes.h"
...@@ -32,7 +33,11 @@ ...@@ -32,7 +33,11 @@
#define RANGESTRAT_CONTAINED_BY 8 #define RANGESTRAT_CONTAINED_BY 8
#define RANGESTRAT_CONTAINS_ELEM 16 #define RANGESTRAT_CONTAINS_ELEM 16
#define RANGESTRAT_EQ 18 #define RANGESTRAT_EQ 18
#define RANGESTRAT_NE 19
/*
 * rangeCopy: copy a RangeType datum and yield the copy as a RangeType *.
 *
 * The pointer r is wrapped as a Datum, handed to datumCopy, and the result is
 * converted back to a pointer.  The typbyval and typlen arguments that
 * datumCopy wants are hardwired for ranges (false and -1) rather than being
 * looked up.
 * NOTE(review): -1 presumably marks a variable-length type -- confirm against
 * datumCopy's contract.
 *
 * The argument r is expanded exactly once.
 */
#define rangeCopy(r) \
((RangeType *) DatumGetPointer(datumCopy(PointerGetDatum(r), \
false, -1)))
/* /*
* Auxiliary structure for picksplit method. * Auxiliary structure for picksplit method.
...@@ -145,6 +150,16 @@ range_gist_penalty(PG_FUNCTION_ARGS) ...@@ -145,6 +150,16 @@ range_gist_penalty(PG_FUNCTION_ARGS)
subtype_diff = &typcache->rng_subdiff_finfo; subtype_diff = &typcache->rng_subdiff_finfo;
/*
* If new is or contains empty, and orig doesn't, apply infinite penalty.
* We really don't want to pollute an empty-free subtree with empties.
*/
if (RangeIsOrContainsEmpty(new) && !RangeIsOrContainsEmpty(orig))
{
*penalty = get_float4_infinity();
PG_RETURN_POINTER(penalty);
}
/* /*
* We want to compare the size of "orig" to size of "orig union new". * We want to compare the size of "orig" to size of "orig union new".
* The penalty will be the sum of the reduction in the lower bound plus * The penalty will be the sum of the reduction in the lower bound plus
...@@ -163,31 +178,10 @@ range_gist_penalty(PG_FUNCTION_ARGS) ...@@ -163,31 +178,10 @@ range_gist_penalty(PG_FUNCTION_ARGS)
} }
else if (empty1) else if (empty1)
{ {
if (lower2.infinite || upper2.infinite) /* infinite penalty for pushing non-empty into all-empty subtree */
{
/* from empty to infinite */
*penalty = get_float4_infinity(); *penalty = get_float4_infinity();
PG_RETURN_POINTER(penalty); PG_RETURN_POINTER(penalty);
} }
else if (OidIsValid(subtype_diff->fn_oid))
{
/* from empty to upper2-lower2 */
*penalty = DatumGetFloat8(FunctionCall2Coll(subtype_diff,
typcache->rng_collation,
upper2.val,
lower2.val));
/* upper2 must be >= lower2 */
if (*penalty < 0)
*penalty = 0; /* subtype_diff is broken */
PG_RETURN_POINTER(penalty);
}
else
{
/* wild guess */
*penalty = 1.0;
PG_RETURN_POINTER(penalty);
}
}
/* if orig isn't empty, s_union can't be either */ /* if orig isn't empty, s_union can't be either */
Assert(!empty2); Assert(!empty2);
...@@ -334,15 +328,27 @@ range_gist_picksplit(PG_FUNCTION_ARGS) ...@@ -334,15 +328,27 @@ range_gist_picksplit(PG_FUNCTION_ARGS)
Datum Datum
range_gist_same(PG_FUNCTION_ARGS) range_gist_same(PG_FUNCTION_ARGS)
{ {
/* Datum r1 = PG_GETARG_DATUM(0); */ RangeType *r1 = PG_GETARG_RANGE(0);
/* Datum r2 = PG_GETARG_DATUM(1); */ RangeType *r2 = PG_GETARG_RANGE(1);
bool *result = (bool *) PG_GETARG_POINTER(2); bool *result = (bool *) PG_GETARG_POINTER(2);
/* /*
* We can safely call range_eq using our fcinfo directly; it won't notice * range_eq will ignore the RANGE_CONTAIN_EMPTY flag, so we have to
* the third argument. This allows it to use fn_extra for caching. * check that for ourselves. More generally, if the entries have been
* properly normalized, then unequal flags bytes must mean unequal ranges
* ... so let's just test all the flag bits at once.
*/
if (range_get_flags(r1) != range_get_flags(r2))
*result = false;
else
{
/*
* We can safely call range_eq using our fcinfo directly; it won't
* notice the third argument. This allows it to use fn_extra for
* caching.
*/ */
*result = DatumGetBool(range_eq(fcinfo)); *result = DatumGetBool(range_eq(fcinfo));
}
PG_RETURN_POINTER(result); PG_RETURN_POINTER(result);
} }
...@@ -356,27 +362,53 @@ range_gist_same(PG_FUNCTION_ARGS) ...@@ -356,27 +362,53 @@ range_gist_same(PG_FUNCTION_ARGS)
/* /*
* Return the smallest range that contains r1 and r2 * Return the smallest range that contains r1 and r2
* *
* XXX would it be better to redefine range_union as working this way? * This differs from regular range_union in two critical ways:
* 1. It won't throw an error for non-adjacent r1 and r2, but just absorb
* the intervening values into the result range.
* 2. We track whether any empty range has been union'd into the result,
* so that contained_by searches can be indexed. Note that this means
* that *all* unions formed within the GiST index must go through here.
*/ */
static RangeType * static RangeType *
range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2) range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
{ {
RangeType *result;
RangeBound lower1, RangeBound lower1,
lower2; lower2;
RangeBound upper1, RangeBound upper1,
upper2; upper2;
bool empty1, bool empty1,
empty2; empty2;
char flags1,
flags2;
RangeBound *result_lower; RangeBound *result_lower;
RangeBound *result_upper; RangeBound *result_upper;
range_deserialize(typcache, r1, &lower1, &upper1, &empty1); range_deserialize(typcache, r1, &lower1, &upper1, &empty1);
range_deserialize(typcache, r2, &lower2, &upper2, &empty2); range_deserialize(typcache, r2, &lower2, &upper2, &empty2);
flags1 = range_get_flags(r1);
flags2 = range_get_flags(r2);
if (empty1) if (empty1)
{
/* We can return r2 as-is if it already is or contains empty */
if (flags2 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
return r2;
/* Else we'd better copy it (modify-in-place isn't safe) */
r2 = rangeCopy(r2);
range_set_contain_empty(r2);
return r2; return r2;
}
if (empty2) if (empty2)
{
/* We can return r1 as-is if it already is or contains empty */
if (flags1 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
return r1;
/* Else we'd better copy it (modify-in-place isn't safe) */
r1 = rangeCopy(r1);
range_set_contain_empty(r1);
return r1; return r1;
}
if (range_cmp_bounds(typcache, &lower1, &lower2) <= 0) if (range_cmp_bounds(typcache, &lower1, &lower2) <= 0)
result_lower = &lower1; result_lower = &lower1;
...@@ -389,12 +421,19 @@ range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2) ...@@ -389,12 +421,19 @@ range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
result_upper = &upper2; result_upper = &upper2;
/* optimization to avoid constructing a new range */ /* optimization to avoid constructing a new range */
if (result_lower == &lower1 && result_upper == &upper1) if (result_lower == &lower1 && result_upper == &upper1 &&
((flags1 & RANGE_CONTAIN_EMPTY) || !(flags2 & RANGE_CONTAIN_EMPTY)))
return r1; return r1;
if (result_lower == &lower2 && result_upper == &upper2) if (result_lower == &lower2 && result_upper == &upper2 &&
((flags2 & RANGE_CONTAIN_EMPTY) || !(flags1 & RANGE_CONTAIN_EMPTY)))
return r2; return r2;
return make_range(typcache, result_lower, result_upper, false); result = make_range(typcache, result_lower, result_upper, false);
if ((flags1 & RANGE_CONTAIN_EMPTY) || (flags2 & RANGE_CONTAIN_EMPTY))
range_set_contain_empty(result);
return result;
} }
/* /*
...@@ -484,21 +523,26 @@ range_gist_consistent_int(FmgrInfo *flinfo, StrategyNumber strategy, ...@@ -484,21 +523,26 @@ range_gist_consistent_int(FmgrInfo *flinfo, StrategyNumber strategy,
break; break;
case RANGESTRAT_CONTAINED_BY: case RANGESTRAT_CONTAINED_BY:
/* /*
* Ideally we'd apply range_overlaps here, but at present it * Empty ranges are contained by anything, so if key is or
* might fail to find empty ranges in the index, which should * contains any empty ranges, we must descend into it. Otherwise,
* be reported as being contained by anything. This needs work. * descend only if key overlaps the query.
*/ */
if (RangeIsOrContainsEmpty(key))
return true; return true;
proc = range_overlaps;
break; break;
case RANGESTRAT_CONTAINS_ELEM: case RANGESTRAT_CONTAINS_ELEM:
proc = range_contains_elem; proc = range_contains_elem;
break; break;
case RANGESTRAT_EQ: case RANGESTRAT_EQ:
/*
* If query is empty, descend only if the key is or contains any
* empty ranges. Otherwise, descend if key contains query.
*/
if (RangeIsEmpty(DatumGetRangeType(query)))
return RangeIsOrContainsEmpty(key);
proc = range_contains; proc = range_contains;
break; break;
case RANGESTRAT_NE:
return true;
break;
default: default:
elog(ERROR, "unrecognized range strategy: %d", strategy); elog(ERROR, "unrecognized range strategy: %d", strategy);
proc = NULL; /* keep compiler quiet */ proc = NULL; /* keep compiler quiet */
...@@ -555,9 +599,6 @@ range_gist_consistent_leaf(FmgrInfo *flinfo, StrategyNumber strategy, ...@@ -555,9 +599,6 @@ range_gist_consistent_leaf(FmgrInfo *flinfo, StrategyNumber strategy,
case RANGESTRAT_EQ: case RANGESTRAT_EQ:
proc = range_eq; proc = range_eq;
break; break;
case RANGESTRAT_NE:
proc = range_ne;
break;
default: default:
elog(ERROR, "unrecognized range strategy: %d", strategy); elog(ERROR, "unrecognized range strategy: %d", strategy);
proc = NULL; /* keep compiler quiet */ proc = NULL; /* keep compiler quiet */
......
...@@ -53,6 +53,6 @@ ...@@ -53,6 +53,6 @@
*/ */
/* yyyymmddN */ /* yyyymmddN */
#define CATALOG_VERSION_NO 201111241 #define CATALOG_VERSION_NO 201111271
#endif #endif
...@@ -736,6 +736,5 @@ DATA(insert ( 3919 3831 3831 7 s 3890 783 0 )); ...@@ -736,6 +736,5 @@ DATA(insert ( 3919 3831 3831 7 s 3890 783 0 ));
DATA(insert ( 3919 3831 3831 8 s 3892 783 0 )); DATA(insert ( 3919 3831 3831 8 s 3892 783 0 ));
DATA(insert ( 3919 3831 2283 16 s 3889 783 0 )); DATA(insert ( 3919 3831 2283 16 s 3889 783 0 ));
DATA(insert ( 3919 3831 3831 18 s 3882 783 0 )); DATA(insert ( 3919 3831 3831 18 s 3882 783 0 ));
DATA(insert ( 3919 3831 3831 19 s 3883 783 0 ));
#endif /* PG_AMOP_H */ #endif /* PG_AMOP_H */
...@@ -34,12 +34,14 @@ typedef struct ...@@ -34,12 +34,14 @@ typedef struct
/* A range's flags byte contains these bits: */ /* A range's flags byte contains these bits: */
#define RANGE_EMPTY 0x01 /* range is empty */ #define RANGE_EMPTY 0x01 /* range is empty */
#define RANGE_LB_INC 0x02 /* lower bound is inclusive (vs exclusive) */ #define RANGE_LB_INC 0x02 /* lower bound is inclusive */
#define RANGE_LB_NULL 0x04 /* lower bound is null (NOT CURRENTLY USED) */ #define RANGE_UB_INC 0x04 /* upper bound is inclusive */
#define RANGE_LB_INF 0x08 /* lower bound is +/- infinity */ #define RANGE_LB_INF 0x08 /* lower bound is -infinity */
#define RANGE_UB_INC 0x10 /* upper bound is inclusive (vs exclusive) */ #define RANGE_UB_INF 0x10 /* upper bound is +infinity */
#define RANGE_UB_NULL 0x20 /* upper bound is null (NOT CURRENTLY USED) */ #define RANGE_LB_NULL 0x20 /* lower bound is null (NOT USED) */
#define RANGE_UB_INF 0x40 /* upper bound is +/- infinity */ #define RANGE_UB_NULL 0x40 /* upper bound is null (NOT USED) */
#define RANGE_CONTAIN_EMPTY 0x80 /* marks a GiST internal-page entry whose
* subtree contains some empty ranges */
#define RANGE_HAS_LBOUND(flags) (!((flags) & (RANGE_EMPTY | \ #define RANGE_HAS_LBOUND(flags) (!((flags) & (RANGE_EMPTY | \
RANGE_LB_NULL | \ RANGE_LB_NULL | \
...@@ -49,7 +51,9 @@ typedef struct ...@@ -49,7 +51,9 @@ typedef struct
RANGE_UB_NULL | \ RANGE_UB_NULL | \
RANGE_UB_INF))) RANGE_UB_INF)))
#define RangeIsEmpty(r) (range_get_flags(r) & RANGE_EMPTY) #define RangeIsEmpty(r) ((range_get_flags(r) & RANGE_EMPTY) != 0)
#define RangeIsOrContainsEmpty(r) \
((range_get_flags(r) & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY)) != 0)
/* Internal representation of either bound of a range (not what's on disk) */ /* Internal representation of either bound of a range (not what's on disk) */
...@@ -152,6 +156,7 @@ extern void range_deserialize(TypeCacheEntry *typcache, RangeType *range, ...@@ -152,6 +156,7 @@ extern void range_deserialize(TypeCacheEntry *typcache, RangeType *range,
RangeBound *lower, RangeBound *upper, RangeBound *lower, RangeBound *upper,
bool *empty); bool *empty);
extern char range_get_flags(RangeType *range); extern char range_get_flags(RangeType *range);
extern void range_set_contain_empty(RangeType *range);
extern RangeType *make_range(TypeCacheEntry *typcache, RangeBound *lower, extern RangeType *make_range(TypeCacheEntry *typcache, RangeBound *lower,
RangeBound *upper, bool empty); RangeBound *upper, bool empty);
extern int range_cmp_bounds(TypeCacheEntry *typcache, RangeBound *b1, extern int range_cmp_bounds(TypeCacheEntry *typcache, RangeBound *b1,
......
...@@ -1041,7 +1041,6 @@ ORDER BY 1, 2, 3; ...@@ -1041,7 +1041,6 @@ ORDER BY 1, 2, 3;
783 | 15 | <-> 783 | 15 | <->
783 | 16 | @> 783 | 16 | @>
783 | 18 | = 783 | 18 | =
783 | 19 | <>
783 | 27 | @> 783 | 27 | @>
783 | 28 | <@ 783 | 28 | <@
783 | 47 | @> 783 | 47 | @>
...@@ -1054,7 +1053,7 @@ ORDER BY 1, 2, 3; ...@@ -1054,7 +1053,7 @@ ORDER BY 1, 2, 3;
2742 | 2 | @@@ 2742 | 2 | @@@
2742 | 3 | <@ 2742 | 3 | <@
2742 | 4 | = 2742 | 4 | =
(44 rows) (43 rows)
-- Check that all opclass search operators have selectivity estimators. -- Check that all opclass search operators have selectivity estimators.
-- This is not absolutely required, but it seems a reasonable thing -- This is not absolutely required, but it seems a reasonable thing
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment