Commit 40ca70eb authored by Amit Kapila's avatar Amit Kapila

Allow using the updated tuple while moving it to a different partition.

An update that causes the tuple to be moved to a different partition was
missing out on re-constructing the to-be-updated tuple, based on the latest
tuple in the update chain.  Instead, it's simply deleting the latest tuple
and inserting a new tuple in the new partition based on the old tuple.
Commit 2f178441 didn't consider this case, so some of the updates were
getting lost.

In passing, change the argument order for output parameter in ExecDelete
and add some commentary about it.

Reported-by: Pavan Deolasee
Author: Amit Khandekar, with minor changes by me
Reviewed-by: Dilip Kumar, Amit Kapila and Alvaro Herrera
Backpatch-through: 11
Discussion: https://postgr.es/m/CAJ3gD9fRbEzDqdeDq1jxqZUb47kJn+tQ7=Bcgjc8quqKsDViKQ@mail.gmail.com
parent edc6b41b
......@@ -2726,11 +2726,19 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo,
false, NULL, NULL, NIL, NULL, transition_capture);
}
/*
* Execute BEFORE ROW DELETE triggers.
*
* True indicates caller can proceed with the delete. False indicates caller
* need to suppress the delete and additionally if requested, we need to pass
* back the concurrently updated tuple if any.
*/
bool
ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
ResultRelInfo *relinfo,
ItemPointer tupleid,
HeapTuple fdw_trigtuple)
HeapTuple fdw_trigtuple,
TupleTableSlot **epqslot)
{
TriggerDesc *trigdesc = relinfo->ri_TrigDesc;
bool result = true;
......@@ -2747,6 +2755,18 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
LockTupleExclusive, &newSlot);
if (trigtuple == NULL)
return false;
/*
* If the tuple was concurrently updated and the caller of this
* function requested for the updated tuple, skip the trigger
* execution.
*/
if (newSlot != NULL && epqslot != NULL)
{
*epqslot = newSlot;
heap_freetuple(trigtuple);
return false;
}
}
else
trigtuple = fdw_trigtuple;
......
......@@ -531,7 +531,7 @@ ExecSimpleRelationDelete(EState *estate, EPQState *epqstate,
{
skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
&searchslot->tts_tuple->t_self,
NULL);
NULL, NULL);
}
if (!skip_tuple)
......
......@@ -609,7 +609,11 @@ ExecInsert(ModifyTableState *mtstate,
* foreign table, tupleid is invalid; the FDW has to figure out
* which row to delete using data from the planSlot. oldtuple is
* passed to foreign table triggers; it is NULL when the foreign
* table has no relevant triggers.
* table has no relevant triggers. We use tupleDeleted to indicate
* whether the tuple is actually deleted, callers can use it to
* decide whether to continue the operation. When this DELETE is a
* part of an UPDATE of partition-key, then the slot returned by
* EvalPlanQual() is passed back using output parameter epqslot.
*
* Returns RETURNING result if any, otherwise NULL.
* ----------------------------------------------------------------
......@@ -621,10 +625,11 @@ ExecDelete(ModifyTableState *mtstate,
TupleTableSlot *planSlot,
EPQState *epqstate,
EState *estate,
bool *tupleDeleted,
bool processReturning,
bool canSetTag,
bool changingPart)
bool changingPart,
bool *tupleDeleted,
TupleTableSlot **epqslot)
{
ResultRelInfo *resultRelInfo;
Relation resultRelationDesc;
......@@ -649,7 +654,7 @@ ExecDelete(ModifyTableState *mtstate,
bool dodelete;
dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
tupleid, oldtuple);
tupleid, oldtuple, epqslot);
if (!dodelete) /* "do nothing" */
return NULL;
......@@ -769,19 +774,30 @@ ldelete:;
if (!ItemPointerEquals(tupleid, &hufd.ctid))
{
TupleTableSlot *epqslot;
epqslot = EvalPlanQual(estate,
epqstate,
resultRelationDesc,
resultRelInfo->ri_RangeTableIndex,
LockTupleExclusive,
&hufd.ctid,
hufd.xmax);
if (!TupIsNull(epqslot))
TupleTableSlot *my_epqslot;
my_epqslot = EvalPlanQual(estate,
epqstate,
resultRelationDesc,
resultRelInfo->ri_RangeTableIndex,
LockTupleExclusive,
&hufd.ctid,
hufd.xmax);
if (!TupIsNull(my_epqslot))
{
*tupleid = hufd.ctid;
goto ldelete;
/*
* If requested, skip delete and pass back the updated
* row.
*/
if (epqslot)
{
*epqslot = my_epqslot;
return NULL;
}
else
goto ldelete;
}
}
/* tuple already deleted; nothing to do */
......@@ -1052,6 +1068,7 @@ lreplace:;
{
bool tuple_deleted;
TupleTableSlot *ret_slot;
TupleTableSlot *epqslot = NULL;
PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
int map_index;
TupleConversionMap *tupconv_map;
......@@ -1081,8 +1098,8 @@ lreplace:;
* processing. We want to return rows from INSERT.
*/
ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate,
estate, &tuple_deleted, false,
false /* canSetTag */ , true /* changingPart */ );
estate, false, false /* canSetTag */ ,
true /* changingPart */ , &tuple_deleted, &epqslot);
/*
* For some reason if DELETE didn't happen (e.g. trigger prevented
......@@ -1105,7 +1122,23 @@ lreplace:;
* resurrect it.
*/
if (!tuple_deleted)
return NULL;
{
/*
* epqslot will be typically NULL. But when ExecDelete()
* finds that another transaction has concurrently updated the
* same row, it re-fetches the row, skips the delete, and
* epqslot is set to the re-fetched tuple slot. In that case,
* we need to do all the checks again.
*/
if (TupIsNull(epqslot))
return NULL;
else
{
slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
tuple = ExecMaterializeSlot(slot);
goto lreplace;
}
}
/*
* Updates set the transition capture map only when a new subplan
......@@ -2136,8 +2169,8 @@ ExecModifyTable(PlanState *pstate)
case CMD_DELETE:
slot = ExecDelete(node, tupleid, oldtuple, planSlot,
&node->mt_epqstate, estate,
NULL, true, node->canSetTag,
false /* changingPart */ );
true, node->canSetTag,
false /* changingPart */ , NULL, NULL);
break;
default:
elog(ERROR, "unknown operation");
......
......@@ -206,7 +206,8 @@ extern bool ExecBRDeleteTriggers(EState *estate,
EPQState *epqstate,
ResultRelInfo *relinfo,
ItemPointer tupleid,
HeapTuple fdw_trigtuple);
HeapTuple fdw_trigtuple,
TupleTableSlot **epqslot);
extern void ExecARDeleteTriggers(EState *estate,
ResultRelInfo *relinfo,
ItemPointer tupleid,
......
Parsed test spec with 2 sessions
starting permutation: s1b s2b s2u1 s1u s2c s1c s1s
step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
step s2u1: UPDATE foo SET b = b || ' update2' WHERE a = 1;
step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
step s2c: COMMIT;
step s1u: <... completed>
step s1c: COMMIT;
step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a;
tableoid a b
foo2 2 ABC update2 update1
starting permutation: s1b s2b s2ut1 s1ut s2c s1c s1st s1stl
step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
step s2ut1: UPDATE footrg SET b = b || ' update2' WHERE a = 1;
step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
step s2c: COMMIT;
step s1ut: <... completed>
step s1c: COMMIT;
step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a;
tableoid a b
footrg2 2 ABC update2 update1
step s1stl: SELECT * FROM triglog ORDER BY a;
a b
1 ABC update2 trigger
starting permutation: s1b s2b s2u2 s1u s2c s1c s1s
step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
step s2u2: UPDATE foo SET b = 'EFG' WHERE a = 1;
step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
step s2c: COMMIT;
step s1u: <... completed>
step s1c: COMMIT;
step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a;
tableoid a b
foo1 1 EFG
starting permutation: s1b s2b s2ut2 s1ut s2c s1c s1st s1stl
step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
step s2ut2: UPDATE footrg SET b = 'EFG' WHERE a = 1;
step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
step s2c: COMMIT;
step s1ut: <... completed>
step s1c: COMMIT;
step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a;
tableoid a b
footrg1 1 EFG
step s1stl: SELECT * FROM triglog ORDER BY a;
a b
......@@ -74,4 +74,5 @@ test: predicate-gin-nomatch
test: partition-key-update-1
test: partition-key-update-2
test: partition-key-update-3
test: partition-key-update-4
test: plpgsql-toast
# Test that a row that ends up in a new partition contains changes made by
# a concurrent transaction.
setup
{
--
-- Setup to test concurrent handling of ExecDelete().
--
CREATE TABLE foo (a int, b text) PARTITION BY LIST(a);
CREATE TABLE foo1 PARTITION OF foo FOR VALUES IN (1);
CREATE TABLE foo2 PARTITION OF foo FOR VALUES IN (2);
INSERT INTO foo VALUES (1, 'ABC');
--
-- Setup to test concurrent handling of GetTupleForTrigger().
--
CREATE TABLE footrg (a int, b text) PARTITION BY LIST(a);
CREATE TABLE triglog as select * from footrg;
CREATE TABLE footrg1 PARTITION OF footrg FOR VALUES IN (1);
CREATE TABLE footrg2 PARTITION OF footrg FOR VALUES IN (2);
INSERT INTO footrg VALUES (1, 'ABC');
CREATE FUNCTION func_footrg() RETURNS TRIGGER AS $$
BEGIN
OLD.b = OLD.b || ' trigger';
-- This will verify that the trigger is not run *before* the row is
-- refetched by EvalPlanQual. The OLD row should contain the changes made
-- by the concurrent session.
INSERT INTO triglog select OLD.*;
RETURN OLD;
END $$ LANGUAGE PLPGSQL;
CREATE TRIGGER footrg_ondel BEFORE DELETE ON footrg1
FOR EACH ROW EXECUTE PROCEDURE func_footrg();
}
teardown
{
DROP TABLE foo;
DROP TRIGGER footrg_ondel ON footrg1;
DROP FUNCTION func_footrg();
DROP TABLE footrg;
DROP TABLE triglog;
}
session "s1"
step "s1b" { BEGIN ISOLATION LEVEL READ COMMITTED; }
step "s1u" { UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; }
step "s1ut" { UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; }
step "s1s" { SELECT tableoid::regclass, * FROM foo ORDER BY a; }
step "s1st" { SELECT tableoid::regclass, * FROM footrg ORDER BY a; }
step "s1stl" { SELECT * FROM triglog ORDER BY a; }
step "s1c" { COMMIT; }
session "s2"
step "s2b" { BEGIN ISOLATION LEVEL READ COMMITTED; }
step "s2u1" { UPDATE foo SET b = b || ' update2' WHERE a = 1; }
step "s2u2" { UPDATE foo SET b = 'EFG' WHERE a = 1; }
step "s2ut1" { UPDATE footrg SET b = b || ' update2' WHERE a = 1; }
step "s2ut2" { UPDATE footrg SET b = 'EFG' WHERE a = 1; }
step "s2c" { COMMIT; }
# Session s1 is moving a row into another partition, but is waiting for
# another session s2 that is updating the original row. The row that ends up
# in the new partition should contain the changes made by session s2.
permutation "s1b" "s2b" "s2u1" "s1u" "s2c" "s1c" "s1s"
# Same as above, except, session s1 is waiting in GetTupleTrigger().
permutation "s1b" "s2b" "s2ut1" "s1ut" "s2c" "s1c" "s1st" "s1stl"
# Below two cases are similar to the above two; except that the session s1
# fails EvalPlanQual() test, so partition key update does not happen.
permutation "s1b" "s2b" "s2u2" "s1u" "s2c" "s1c" "s1s"
permutation "s1b" "s2b" "s2ut2" "s1ut" "s2c" "s1c" "s1st" "s1stl"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment