Commit 489247b0 authored by Alvaro Herrera's avatar Alvaro Herrera

Improve pruning of a default partition

When querying a partitioned table containing a default partition, we
were wrongly deciding to include it in the scan too early in the
process, failing to exclude it in some cases.  If we reinterpret the
PruneStepResult.scan_default flag slightly, we can do a better job at
detecting that it can be excluded.  The change is that we avoid setting
the flag for that pruning step unless the step absolutely requires the
default partition to be scanned (in contrast with the previous
arrangement, which was to set it unless the step was able to prune it).
So get_matching_partitions() must explicitly check the partition that
each returned bound value corresponds to in order to determine whether
the default one needs to be included, rather than relying on the flag
from the final step result.

Author: Yuzuko Hosoya <hosoya.yuzuko@lab.ntt.co.jp>
Reviewed-by: default avatarAmit Langote <Langote_Amit_f8@lab.ntt.co.jp>
Discussion: https://postgr.es/m/00e601d4ca86$932b8bc0$b982a340$@lab.ntt.co.jp
parent 69edf4f8
......@@ -735,6 +735,7 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
PruneStepResult **results,
*final_result;
ListCell *lc;
bool scan_default;
/* If there are no pruning steps then all partitions match. */
if (num_steps == 0)
......@@ -786,30 +787,39 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
Assert(final_result != NULL);
i = -1;
result = NULL;
scan_default = final_result->scan_default;
while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
{
int partindex = context->boundinfo->indexes[i];
/*
* In range and hash partitioning cases, some slots may contain -1,
* indicating that no partition has been defined to accept a given
* range of data or for a given remainder, respectively. The default
* partition, if any, in case of range partitioning, will be added to
* the result, because the specified range still satisfies the query's
* conditions.
*/
if (partindex >= 0)
result = bms_add_member(result, partindex);
if (partindex < 0)
{
/*
* In range partitioning cases, if a partition index is -1 it
* means that the bound at the offset is the upper bound for a
* range not covered by any partition (other than a possible
* default partition). In hash partitioning, the same means no
* partition has been defined for the corresponding remainder
* value.
*
* In either case, the value is still part of the queried range of
* values, so mark to scan the default partition if one exists.
*/
scan_default |= partition_bound_has_default(context->boundinfo);
continue;
}
result = bms_add_member(result, partindex);
}
/* Add the null and/or default partition if needed and if present. */
/* Add the null and/or default partition if needed and present. */
if (final_result->scan_null)
{
Assert(context->strategy == PARTITION_STRATEGY_LIST);
Assert(partition_bound_accepts_nulls(context->boundinfo));
result = bms_add_member(result, context->boundinfo->null_index);
}
if (final_result->scan_default)
if (scan_default)
{
Assert(context->strategy == PARTITION_STRATEGY_LIST ||
context->strategy == PARTITION_STRATEGY_RANGE);
......@@ -2438,6 +2448,11 @@ get_matching_hash_bounds(PartitionPruneContext *context,
* get_matching_list_bounds
* Determine the offsets of list bounds matching the specified value,
* according to the semantics of the given operator strategy
*
* scan_default will be set in the returned struct, if the default partition
* needs to be scanned, provided one exists at all. scan_null will be set if
* the special null-accepting partition needs to be scanned.
*
* 'opstrategy' if non-zero must be a btree strategy number.
*
* 'value' contains the value to use for pruning.
......@@ -2640,8 +2655,13 @@ get_matching_list_bounds(PartitionPruneContext *context,
* Each datum whose offset is in result is to be treated as the upper bound of
* the partition that will contain the desired values.
*
* If default partition needs to be scanned for given values, set scan_default
* in result if present.
* scan_default is set in the returned struct if a default partition exists
* and we're absolutely certain that it needs to be scanned. We do *not* set
* it just because values match portions of the key space uncovered by
* partitions other than default (space which we normally assume to belong to
* the default partition): the final set of bounds obtained after combining
* multiple pruning steps might exclude it, so we infer its inclusion
* elsewhere.
*
* 'opstrategy' if non-zero must be a btree strategy number.
*
......@@ -2667,8 +2687,7 @@ get_matching_range_bounds(PartitionPruneContext *context,
int *partindices = boundinfo->indexes;
int off,
minoff,
maxoff,
i;
maxoff;
bool is_equal;
bool inclusive = false;
......@@ -2698,13 +2717,15 @@ get_matching_range_bounds(PartitionPruneContext *context,
*/
if (nvalues == 0)
{
/* ignore key space not covered by any partitions */
if (partindices[minoff] < 0)
minoff++;
if (partindices[maxoff] < 0)
maxoff--;
result->scan_default = partition_bound_has_default(boundinfo);
Assert(minoff >= 0 && maxoff >= 0);
Assert(partindices[minoff] >= 0 &&
partindices[maxoff] >= 0);
result->bound_offsets = bms_add_range(NULL, minoff, maxoff);
return result;
......@@ -2732,11 +2753,7 @@ get_matching_range_bounds(PartitionPruneContext *context,
if (nvalues == partnatts)
{
/* There can only be zero or one matching partition. */
if (partindices[off + 1] >= 0)
result->bound_offsets = bms_make_singleton(off + 1);
else
result->scan_default =
partition_bound_has_default(boundinfo);
result->bound_offsets = bms_make_singleton(off + 1);
return result;
}
else
......@@ -2824,57 +2841,21 @@ get_matching_range_bounds(PartitionPruneContext *context,
maxoff = off + 1;
}
/*
* Skip if minoff/maxoff are actually the upper bound of a
* un-assigned portion of values.
*/
if (partindices[minoff] < 0 && minoff < boundinfo->ndatums)
minoff++;
if (partindices[maxoff] < 0 && maxoff >= 1)
maxoff--;
/*
* There may exist a range of values unassigned to any
* non-default partition between the datums at minoff and
* maxoff. Add the default partition in that case.
*/
if (partition_bound_has_default(boundinfo))
{
for (i = minoff; i <= maxoff; i++)
{
if (partindices[i] < 0)
{
result->scan_default = true;
break;
}
}
}
Assert(minoff >= 0 && maxoff >= 0);
result->bound_offsets = bms_add_range(NULL, minoff, maxoff);
}
else if (off >= 0) /* !is_equal */
else
{
/*
* The lookup value falls in the range between some bounds in
* boundinfo. 'off' would be the offset of the greatest bound
* that is <= lookup value, so add off + 1 to the result
* instead as the offset of the upper bound of the only
* partition that may contain the lookup value.
*/
if (partindices[off + 1] >= 0)
result->bound_offsets = bms_make_singleton(off + 1);
else
result->scan_default =
partition_bound_has_default(boundinfo);
}
else
{
/*
* off < 0: the lookup value is smaller than all bounds, so
* only the default partition qualifies, if there is one.
* partition that may contain the lookup value. If 'off' is
* -1 indicating that all bounds are greater, then we simply
* end up adding the first bound's offset, that is, 0.
*/
result->scan_default = partition_bound_has_default(boundinfo);
result->bound_offsets = bms_make_singleton(off + 1);
}
return result;
......@@ -2945,16 +2926,18 @@ get_matching_range_bounds(PartitionPruneContext *context,
minoff = inclusive ? off : off + 1;
}
/*
* lookup value falls in the range between some bounds in
* boundinfo. off would be the offset of the greatest bound
* that is <= lookup value, so add off + 1 to the result
* instead as the offset of the upper bound of the smallest
* partition that may contain the lookup value.
*/
else
{
/*
* lookup value falls in the range between some bounds in
* boundinfo. off would be the offset of the greatest
* bound that is <= lookup value, so add off + 1 to the
* result instead as the offset of the upper bound of the
* smallest partition that may contain the lookup value.
*/
minoff = off + 1;
}
}
break;
......@@ -2972,16 +2955,7 @@ get_matching_range_bounds(PartitionPruneContext *context,
boundinfo,
nvalues, values,
&is_equal);
if (off < 0)
{
/*
* All bounds are greater than the key, so we could only
* expect to find the lookup key in the default partition.
*/
result->scan_default = partition_bound_has_default(boundinfo);
return result;
}
else
if (off >= 0)
{
/*
* See the comment above.
......@@ -3029,6 +3003,14 @@ get_matching_range_bounds(PartitionPruneContext *context,
else
maxoff = off;
}
else
{
/*
* 'off' is -1 indicating that all bounds are greater, so just
* set the first bound's offset as maxoff.
*/
maxoff = off + 1;
}
break;
default:
......@@ -3036,58 +3018,43 @@ get_matching_range_bounds(PartitionPruneContext *context,
break;
}
Assert(minoff >= 0 && minoff <= boundinfo->ndatums);
Assert(maxoff >= 0 && maxoff <= boundinfo->ndatums);
/*
* Skip a gap and when doing so, check if the bound contains a finite
* value to decide if we need to add the default partition. If it's an
* infinite bound, we need not add the default partition, as having an
* infinite bound means the partition in question catches any values that
* would otherwise be in the default partition.
* If the smallest partition to return has MINVALUE (negative infinity) as
* its lower bound, increment it to point to the next finite bound
* (supposedly its upper bound), so that we don't advertently end up
* scanning the default partition.
*/
if (partindices[minoff] < 0)
if (minoff < boundinfo->ndatums && partindices[minoff] < 0)
{
int lastkey = nvalues - 1;
if (minoff >= 0 &&
minoff < boundinfo->ndatums &&
boundinfo->kind[minoff][lastkey] ==
PARTITION_RANGE_DATUM_VALUE)
result->scan_default = partition_bound_has_default(boundinfo);
minoff++;
if (boundinfo->kind[minoff][lastkey] ==
PARTITION_RANGE_DATUM_MINVALUE)
{
minoff++;
Assert(boundinfo->indexes[minoff] >= 0);
}
}
/*
* Skip a gap. See the above comment about how we decide whether or not
* to scan the default partition based whether the datum that will become
* the maximum datum is finite or not.
* If the previous greatest partition has MAXVALUE (positive infinity) as
* its upper bound (something only possible to do with multi-column range
* partitioning), we scan switch to it as the greatest partition to
* return. Again, so that we don't advertently end up scanning the
* default partition.
*/
if (maxoff >= 1 && partindices[maxoff] < 0)
{
int lastkey = nvalues - 1;
if (maxoff >= 0 &&
maxoff <= boundinfo->ndatums &&
boundinfo->kind[maxoff - 1][lastkey] ==
PARTITION_RANGE_DATUM_VALUE)
result->scan_default = partition_bound_has_default(boundinfo);
maxoff--;
}
if (partition_bound_has_default(boundinfo))
{
/*
* There may exist a range of values unassigned to any non-default
* partition between the datums at minoff and maxoff. Add the default
* partition in that case.
*/
for (i = minoff; i <= maxoff; i++)
if (boundinfo->kind[maxoff - 1][lastkey] ==
PARTITION_RANGE_DATUM_MAXVALUE)
{
if (partindices[i] < 0)
{
result->scan_default = true;
break;
}
maxoff--;
Assert(boundinfo->indexes[maxoff] >= 0);
}
}
......@@ -3332,14 +3299,24 @@ perform_pruning_combine_step(PartitionPruneContext *context,
/*
* A combine step without any source steps is an indication to not perform
* any partition pruning, we just return all partitions.
* any partition pruning. Return all datum indexes in that case.
*/
result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
if (list_length(cstep->source_stepids) == 0)
{
PartitionBoundInfo boundinfo = context->boundinfo;
int rangemax;
/*
* Add all valid offsets into the boundinfo->indexes array. For range
* partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1)
* valid entries; otherwise there are boundinfo->ndatums.
*/
rangemax = context->strategy == PARTITION_STRATEGY_RANGE ?
boundinfo->ndatums : boundinfo->ndatums - 1;
result->bound_offsets = bms_add_range(NULL, 0, boundinfo->ndatums - 1);
result->bound_offsets =
bms_add_range(result->bound_offsets, 0, rangemax);
result->scan_default = partition_bound_has_default(boundinfo);
result->scan_null = partition_bound_accepts_nulls(boundinfo);
return result;
......
......@@ -56,7 +56,6 @@
* pointed by remainder produced when hash value of the datum-tuple is divided
* by the greatest modulus.
*/
typedef struct PartitionBoundInfoData
{
char strategy; /* hash, list or range? */
......
......@@ -504,15 +504,13 @@ explain (costs off) select * from rlp where a <= 31;
Filter: (a <= 31)
-> Seq Scan on rlp5_1
Filter: (a <= 31)
-> Seq Scan on rlp5_default
Filter: (a <= 31)
-> Seq Scan on rlp_default_10
Filter: (a <= 31)
-> Seq Scan on rlp_default_30
Filter: (a <= 31)
-> Seq Scan on rlp_default_default
Filter: (a <= 31)
(29 rows)
(27 rows)
explain (costs off) select * from rlp where a = 1 or a = 7;
QUERY PLAN
......@@ -559,11 +557,7 @@ explain (costs off) select * from rlp where a > 20 and a < 27;
Filter: ((a > 20) AND (a < 27))
-> Seq Scan on rlp4_2
Filter: ((a > 20) AND (a < 27))
-> Seq Scan on rlp4_default
Filter: ((a > 20) AND (a < 27))
-> Seq Scan on rlp_default_default
Filter: ((a > 20) AND (a < 27))
(9 rows)
(5 rows)
explain (costs off) select * from rlp where a = 29;
QUERY PLAN
......@@ -588,6 +582,16 @@ explain (costs off) select * from rlp where a >= 29;
Filter: (a >= 29)
(11 rows)
explain (costs off) select * from rlp where a < 1 or (a > 20 and a < 25);
QUERY PLAN
------------------------------------------------------
Append
-> Seq Scan on rlp1
Filter: ((a < 1) OR ((a > 20) AND (a < 25)))
-> Seq Scan on rlp4_1
Filter: ((a < 1) OR ((a > 20) AND (a < 25)))
(5 rows)
-- redundant clauses are eliminated
explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */
QUERY PLAN
......
......@@ -83,6 +83,7 @@ explain (costs off) select * from rlp where a = 1 or b = 'ab';
explain (costs off) select * from rlp where a > 20 and a < 27;
explain (costs off) select * from rlp where a = 29;
explain (costs off) select * from rlp where a >= 29;
explain (costs off) select * from rlp where a < 1 or (a > 20 and a < 25);
-- redundant clauses are eliminated
explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment