Commit f481d282 authored by Alvaro Herrera's avatar Alvaro Herrera

Check default partitions constraints while descending

Partitioning tuple route code assumes that the partition chosen while
descending the partition hierarchy is always the correct one.  This is
true except when the partition is the default partition and another
partition has been added concurrently: the partition constraint changes
and we don't recheck it.  This can lead to tuples mistakenly being added
to the default partition that should have been rejected.

Fix by rechecking the default partition constraint while descending the
hierarchy.

An isolation test based on the reproduction steps described by Hao Wu
(with tweaks for extra coverage) is included.

Backpatch to 12, where this bug came in with 898e5e32.

Reported by: Hao Wu <hawu@vmware.com>
Author: Amit Langote <amitlangote09@gmail.com>
Author: Álvaro Herrera <alvherre@alvh.no-ip.org>
Discussion: https://postgr.es/m/CA+HiwqFqBmcSSap4sFnCBUEL_VfOMmEKaQ3gwUhyfa4c7J_-nA@mail.gmail.com
Discussion: https://postgr.es/m/DM5PR0501MB3910E97A9EDFB4C775CF3D75A42F0@DM5PR0501MB3910.namprd05.prod.outlook.com
parent c9ae5cbb
...@@ -51,6 +51,11 @@ ...@@ -51,6 +51,11 @@
* PartitionDispatchData->indexes for details on how this array is * PartitionDispatchData->indexes for details on how this array is
* indexed. * indexed.
* *
* nonleaf_partitions
* Array of 'max_dispatch' elements containing pointers to fake
* ResultRelInfo objects for nonleaf partitions, useful for checking
* the partition constraint.
*
* num_dispatch * num_dispatch
* The current number of items stored in the 'partition_dispatch_info' * The current number of items stored in the 'partition_dispatch_info'
* array. Also serves as the index of the next free array element for * array. Also serves as the index of the next free array element for
...@@ -89,6 +94,7 @@ struct PartitionTupleRouting ...@@ -89,6 +94,7 @@ struct PartitionTupleRouting
{ {
Relation partition_root; Relation partition_root;
PartitionDispatch *partition_dispatch_info; PartitionDispatch *partition_dispatch_info;
ResultRelInfo **nonleaf_partitions;
int num_dispatch; int num_dispatch;
int max_dispatch; int max_dispatch;
ResultRelInfo **partitions; ResultRelInfo **partitions;
...@@ -280,9 +286,11 @@ ExecFindPartition(ModifyTableState *mtstate, ...@@ -280,9 +286,11 @@ ExecFindPartition(ModifyTableState *mtstate,
PartitionDispatch dispatch; PartitionDispatch dispatch;
PartitionDesc partdesc; PartitionDesc partdesc;
ExprContext *ecxt = GetPerTupleExprContext(estate); ExprContext *ecxt = GetPerTupleExprContext(estate);
TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple;
TupleTableSlot *rootslot = slot;
TupleTableSlot *myslot = NULL; TupleTableSlot *myslot = NULL;
MemoryContext oldcxt; MemoryContext oldcxt;
ResultRelInfo *rri = NULL;
/* use per-tuple context here to avoid leaking memory */ /* use per-tuple context here to avoid leaking memory */
oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
...@@ -296,9 +304,8 @@ ExecFindPartition(ModifyTableState *mtstate, ...@@ -296,9 +304,8 @@ ExecFindPartition(ModifyTableState *mtstate,
/* start with the root partitioned table */ /* start with the root partitioned table */
dispatch = pd[0]; dispatch = pd[0];
while (true) while (dispatch != NULL)
{ {
AttrMap *map = dispatch->tupmap;
int partidx = -1; int partidx = -1;
CHECK_FOR_INTERRUPTS(); CHECK_FOR_INTERRUPTS();
...@@ -306,17 +313,6 @@ ExecFindPartition(ModifyTableState *mtstate, ...@@ -306,17 +313,6 @@ ExecFindPartition(ModifyTableState *mtstate,
rel = dispatch->reldesc; rel = dispatch->reldesc;
partdesc = dispatch->partdesc; partdesc = dispatch->partdesc;
/*
* Convert the tuple to this parent's layout, if different from the
* current relation.
*/
myslot = dispatch->tupslot;
if (myslot != NULL)
{
Assert(map != NULL);
slot = execute_attr_map_slot(map, slot, myslot);
}
/* /*
* Extract partition key from tuple. Expression evaluation machinery * Extract partition key from tuple. Expression evaluation machinery
* that FormPartitionKeyDatum() invokes expects ecxt_scantuple to * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
...@@ -352,11 +348,9 @@ ExecFindPartition(ModifyTableState *mtstate, ...@@ -352,11 +348,9 @@ ExecFindPartition(ModifyTableState *mtstate,
if (partdesc->is_leaf[partidx]) if (partdesc->is_leaf[partidx])
{ {
ResultRelInfo *rri;
/* /*
* Look to see if we've already got a ResultRelInfo for this * We've reached the leaf -- hurray, we're done. Look to see if
* partition. * we've already got a ResultRelInfo for this partition.
*/ */
if (likely(dispatch->indexes[partidx] >= 0)) if (likely(dispatch->indexes[partidx] >= 0))
{ {
...@@ -400,14 +394,10 @@ ExecFindPartition(ModifyTableState *mtstate, ...@@ -400,14 +394,10 @@ ExecFindPartition(ModifyTableState *mtstate,
dispatch, dispatch,
rootResultRelInfo, partidx); rootResultRelInfo, partidx);
} }
Assert(rri != NULL);
/* Release the tuple in the lowest parent's dedicated slot. */ /* Signal to terminate the loop */
if (slot == myslot) dispatch = NULL;
ExecClearTuple(myslot);
MemoryContextSwitchTo(oldcxt);
ecxt->ecxt_scantuple = ecxt_scantuple_old;
return rri;
} }
else else
{ {
...@@ -419,6 +409,8 @@ ExecFindPartition(ModifyTableState *mtstate, ...@@ -419,6 +409,8 @@ ExecFindPartition(ModifyTableState *mtstate,
/* Already built. */ /* Already built. */
Assert(dispatch->indexes[partidx] < proute->num_dispatch); Assert(dispatch->indexes[partidx] < proute->num_dispatch);
rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
/* /*
* Move down to the next partition level and search again * Move down to the next partition level and search again
* until we find a leaf partition that matches this tuple * until we find a leaf partition that matches this tuple
...@@ -440,10 +432,75 @@ ExecFindPartition(ModifyTableState *mtstate, ...@@ -440,10 +432,75 @@ ExecFindPartition(ModifyTableState *mtstate,
dispatch, partidx); dispatch, partidx);
Assert(dispatch->indexes[partidx] >= 0 && Assert(dispatch->indexes[partidx] >= 0 &&
dispatch->indexes[partidx] < proute->num_dispatch); dispatch->indexes[partidx] < proute->num_dispatch);
rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
dispatch = subdispatch; dispatch = subdispatch;
} }
/*
* Convert the tuple to the new parent's layout, if different from
* the previous parent.
*/
if (dispatch->tupslot)
{
AttrMap *map = dispatch->tupmap;
TupleTableSlot *tempslot = myslot;
myslot = dispatch->tupslot;
slot = execute_attr_map_slot(map, slot, myslot);
if (tempslot != NULL)
ExecClearTuple(tempslot);
}
}
/*
* If this partition is the default one, we must check its partition
* constraint now, which may have changed concurrently due to
* partitions being added to the parent.
*
* (We do this here, and do not rely on ExecInsert doing it, because
* we don't want to miss doing it for non-leaf partitions.)
*/
if (partidx == partdesc->boundinfo->default_index)
{
PartitionRoutingInfo *partrouteinfo = rri->ri_PartitionInfo;
/*
* The tuple must match the partition's layout for the constraint
* expression to be evaluated successfully. If the partition is
* sub-partitioned, that would already be the case due to the code
* above, but for a leaf partition the tuple still matches the
* parent's layout.
*
* Note that we have a map to convert from root to current
* partition, but not from immediate parent to current partition.
* So if we have to convert, do it from the root slot; if not, use
* the root slot as-is.
*/
if (partrouteinfo)
{
TupleConversionMap *map = partrouteinfo->pi_RootToPartitionMap;
if (map)
slot = execute_attr_map_slot(map->attrMap, rootslot,
partrouteinfo->pi_PartitionTupleSlot);
else
slot = rootslot;
}
ExecPartitionCheck(rri, slot, estate, true);
} }
} }
/* Release the tuple in the lowest parent's dedicated slot. */
if (myslot != NULL)
ExecClearTuple(myslot);
/* and restore ecxt's scantuple */
ecxt->ecxt_scantuple = ecxt_scantuple_saved;
MemoryContextSwitchTo(oldcxt);
return rri;
} }
/* /*
...@@ -1060,6 +1117,8 @@ ExecInitPartitionDispatchInfo(EState *estate, ...@@ -1060,6 +1117,8 @@ ExecInitPartitionDispatchInfo(EState *estate,
proute->max_dispatch = 4; proute->max_dispatch = 4;
proute->partition_dispatch_info = (PartitionDispatch *) proute->partition_dispatch_info = (PartitionDispatch *)
palloc(sizeof(PartitionDispatch) * proute->max_dispatch); palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
proute->nonleaf_partitions = (ResultRelInfo **)
palloc(sizeof(ResultRelInfo *) * proute->max_dispatch);
} }
else else
{ {
...@@ -1067,10 +1126,28 @@ ExecInitPartitionDispatchInfo(EState *estate, ...@@ -1067,10 +1126,28 @@ ExecInitPartitionDispatchInfo(EState *estate,
proute->partition_dispatch_info = (PartitionDispatch *) proute->partition_dispatch_info = (PartitionDispatch *)
repalloc(proute->partition_dispatch_info, repalloc(proute->partition_dispatch_info,
sizeof(PartitionDispatch) * proute->max_dispatch); sizeof(PartitionDispatch) * proute->max_dispatch);
proute->nonleaf_partitions = (ResultRelInfo **)
repalloc(proute->nonleaf_partitions,
sizeof(ResultRelInfo *) * proute->max_dispatch);
} }
} }
proute->partition_dispatch_info[dispatchidx] = pd; proute->partition_dispatch_info[dispatchidx] = pd;
/*
* If setting up a PartitionDispatch for a sub-partitioned table, we may
* also need a minimally valid ResultRelInfo for checking the partition
* constraint later; set that up now.
*/
if (parent_pd)
{
ResultRelInfo *rri = makeNode(ResultRelInfo);
InitResultRelInfo(rri, rel, 1, proute->partition_root, 0);
proute->nonleaf_partitions[dispatchidx] = rri;
}
else
proute->nonleaf_partitions[dispatchidx] = NULL;
/* /*
* Finally, if setting up a PartitionDispatch for a sub-partitioned table, * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
* install a downlink in the parent to allow quick descent. * install a downlink in the parent to allow quick descent.
......
Parsed test spec with 2 sessions
starting permutation: s1b s1a s2b s2i s1c s2c s2s
step s1b: begin;
step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200);
step s2b: begin;
step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); <waiting ...>
step s1c: commit;
step s2i: <... completed>
error in steps s1c s2i: ERROR: new row for relation "tpart_default" violates partition constraint
step s2c: commit;
step s2s: select tableoid::regclass, * from tpart;
tableoid i j
tpart_2 110 xxx
tpart_2 120 yyy
tpart_2 150 zzz
starting permutation: s1b s1a s2b s2i2 s1c s2c s2s
step s1b: begin;
step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200);
step s2b: begin;
step s2i2: insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); <waiting ...>
step s1c: commit;
step s2i2: <... completed>
error in steps s1c s2i2: ERROR: new row for relation "tpart_default" violates partition constraint
step s2c: commit;
step s2s: select tableoid::regclass, * from tpart;
tableoid i j
tpart_2 110 xxx
tpart_2 120 yyy
tpart_2 150 zzz
starting permutation: s1b s2b s2i s1a s2c s1c s2s
step s1b: begin;
step s2b: begin;
step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz');
step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); <waiting ...>
step s2c: commit;
step s1a: <... completed>
error in steps s2c s1a: ERROR: updated partition constraint for default partition "tpart_default_default" would be violated by some row
step s1c: commit;
step s2s: select tableoid::regclass, * from tpart;
tableoid i j
tpart_default_default110 xxx
tpart_default_default120 yyy
tpart_default_default150 zzz
...@@ -81,6 +81,7 @@ test: vacuum-skip-locked ...@@ -81,6 +81,7 @@ test: vacuum-skip-locked
test: predicate-hash test: predicate-hash
test: predicate-gist test: predicate-gist
test: predicate-gin test: predicate-gin
test: partition-concurrent-attach
test: partition-key-update-1 test: partition-key-update-1
test: partition-key-update-2 test: partition-key-update-2
test: partition-key-update-3 test: partition-key-update-3
......
# Verify that default partition constraint is enforced correctly
# in light of partitions being added concurrently to its parent
setup {
drop table if exists tpart;
create table tpart(i int, j text) partition by range(i);
create table tpart_1(like tpart);
create table tpart_2(like tpart);
create table tpart_default (a int, j text, i int) partition by list (j);
create table tpart_default_default (a int, i int, b int, j text);
alter table tpart_default_default drop b;
alter table tpart_default attach partition tpart_default_default default;
alter table tpart_default drop a;
alter table tpart attach partition tpart_default default;
alter table tpart attach partition tpart_1 for values from(0) to (100);
insert into tpart_2 values (110,'xxx'), (120, 'yyy'), (150, 'zzz');
}
session "s1"
step "s1b" { begin; }
step "s1a" { alter table tpart attach partition tpart_2 for values from (100) to (200); }
step "s1c" { commit; }
session "s2"
step "s2b" { begin; }
step "s2i" { insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); }
step "s2i2" { insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); }
step "s2c" { commit; }
step "s2s" { select tableoid::regclass, * from tpart; }
teardown { drop table tpart; }
# insert into tpart by s2 which routes to tpart_default due to not seeing
# concurrently added tpart_2 should fail, because the partition constraint
# of tpart_default would have changed due to tpart_2 having been added
permutation "s1b" "s1a" "s2b" "s2i" "s1c" "s2c" "s2s"
# similar to above, but now insert into sub-partitioned tpart_default
permutation "s1b" "s1a" "s2b" "s2i2" "s1c" "s2c" "s2s"
# reverse: now the insert into tpart_default by s2 occurs first followed by
# attach in s1, which should fail when it scans the leaf default partition
# find the violating rows
permutation "s1b" "s2b" "s2i" "s1a" "s2c" "s1c" "s2s"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment