Commit 90c885cd authored by Andres Freund

Increment xactCompletionCount during subtransaction abort.

Snapshot caching, introduced in 623a9ba7, did not increment
xactCompletionCount during subtransaction abort. That could lead to an older
snapshot being reused. That is, at least as far as I can see, not a
correctness issue (for MVCC snapshots there's no difference between "in
progress" and "aborted"). The only difference between the old and new
snapshots would be a newer ->xmax.
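
For context, the snapshot cache introduced in 623a9ba7 keys reuse on that
counter; roughly (a simplified sketch of the GetSnapshotData() /
GetSnapshotDataReuse() check, not the actual code):

    /*
     * Sketch: the cached snapshot may only be reused while no transaction
     * (including subtransactions) has completed since it was built, which
     * is what ShmemVariableCache->xactCompletionCount tracks.  If an
     * aborting subtransaction fails to bump the counter, this keeps
     * succeeding and the stale snapshot, with its older ->xmax, stays in
     * use.
     */
    if (snapshot->snapXactCompletionCount ==
        ShmemVariableCache->xactCompletionCount)
        return true;            /* reuse the cached snapshot */
    return false;               /* rebuild the snapshot from scratch */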

While HeapTupleSatisfiesMVCC makes the same visibility determination either
way, reusing the old snapshot leads HeapTupleSatisfiesMVCC to not set
HEAP_XMIN_INVALID, which in turn prevents the kill_prior_tuple optimization
from kicking in (via HeapTupleIsSurelyDead() returning false). The
performance effect of doing the same index lookups over and over again is how
the issue was discovered...
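
To spell out the hint-bit interaction, the relevant HeapTupleSatisfiesMVCC()
branch looks roughly like this (condensed illustration, not a verbatim
excerpt):

    /*
     * With a fresh snapshot the aborted subtransaction's xmin lies below
     * ->xmax, so we fall through to the clog check and set
     * HEAP_XMIN_INVALID; HeapTupleIsSurelyDead() can then report the tuple
     * as dead and the index scan sets kill_prior_tuple.  With the stale
     * snapshot the xmin still looks "in progress" and we return before any
     * hint bit is set.
     */
    if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot))
        return false;           /* stale snapshot takes this exit, no hint */
    else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
        SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
                    HeapTupleHeaderGetRawXmin(tuple));
    else
    {
        /* it must have aborted or crashed */
        SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, InvalidTransactionId);
        return false;
    }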

Fix the issue by incrementing xactCompletionCount in
XidCacheRemoveRunningXids. It already acquires ProcArrayLock exclusively,
making that an easy proposition.

Add a test to ensure that kill_prior_tuple prevents index growth when it
involves an aborted subtransaction of the current transaction.

Author: Andres Freund
Discussion: https://postgr.es/m/20210406043521.lopeo7bbigad3n6t@alap3.anarazel.de
Discussion: https://postgr.es/m/20210317055718.v6qs3ltzrformqoa%40alap3.anarazel.de
parent 8523492d
@@ -1210,6 +1210,11 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 	 */
 	MaintainLatestCompletedXidRecovery(running->latestCompletedXid);
+	/*
+	 * NB: No need to increment ShmemVariableCache->xactCompletionCount here,
+	 * nobody can see it yet.
+	 */
 	LWLockRelease(ProcArrayLock);
 	/* ShmemVariableCache->nextXid must be beyond any observed xid. */

@@ -3915,6 +3920,9 @@ XidCacheRemoveRunningXids(TransactionId xid,
 	/* Also advance global latestCompletedXid while holding the lock */
 	MaintainLatestCompletedXid(latestXid);
+	/* ... and xactCompletionCount */
+	ShmemVariableCache->xactCompletionCount++;
 	LWLockRelease(ProcArrayLock);
 }
...
--
-- Verify that index scans encountering dead rows produced by an
-- aborted subtransaction of the current transaction can utilize the
-- kill_prior_tuple optimization
--
-- NB: The table size is currently *not* expected to stay the same, we
-- don't have logic to trigger opportunistic pruning in cases like
-- this.
BEGIN;
SET LOCAL enable_seqscan = false;
SET LOCAL enable_indexonlyscan = false;
SET LOCAL enable_bitmapscan = false;
-- Can't easily use a unique index, since dead tuples can be found
-- independent of the kill_prior_tuples optimization.
CREATE TABLE clean_aborted_self(key int, data text);
CREATE INDEX clean_aborted_self_key ON clean_aborted_self(key);
INSERT INTO clean_aborted_self (key, data) VALUES (-1, 'just to allocate metapage');
-- save index size from before the changes, for comparison
SELECT pg_relation_size('clean_aborted_self_key') AS clean_aborted_self_key_before \gset
DO $$
BEGIN
-- iterate often enough to see index growth even on larger-than-default page sizes
FOR i IN 1..100 LOOP
BEGIN
-- perform index scan over all the inserted keys to get them to be seen as dead
IF EXISTS(SELECT * FROM clean_aborted_self WHERE key > 0 AND key < 100) THEN
RAISE data_corrupted USING MESSAGE = 'these rows should not exist';
END IF;
INSERT INTO clean_aborted_self SELECT g.i, 'rolling back in a sec' FROM generate_series(1, 100) g(i);
-- just some error that's not normally thrown
RAISE reading_sql_data_not_permitted USING MESSAGE = 'round and round again';
EXCEPTION WHEN reading_sql_data_not_permitted THEN END;
END LOOP;
END;$$;
-- show sizes only if they differ
SELECT :clean_aborted_self_key_before AS size_before, pg_relation_size('clean_aborted_self_key') size_after
WHERE :clean_aborted_self_key_before != pg_relation_size('clean_aborted_self_key');
size_before | size_after
-------------+------------
(0 rows)
ROLLBACK;
@@ -29,7 +29,7 @@ test: strings numerology point lseg line box path polygon circle date time timet
 # geometry depends on point, lseg, box, path, polygon and circle
 # horology depends on interval, timetz, timestamp, timestamptz
 # ----------
-test: geometry horology regex type_sanity opr_sanity misc_sanity comments expressions unicode xid
+test: geometry horology regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc
 # ----------
 # These four each depend on the previous one
...
@@ -11,6 +11,7 @@ test: int4
 test: int8
 test: oid
 test: xid
+test: mvcc
 test: float4
 test: float8
 test: bit
...
--
-- Verify that index scans encountering dead rows produced by an
-- aborted subtransaction of the current transaction can utilize the
-- kill_prior_tuple optimization
--
-- NB: The table size is currently *not* expected to stay the same, we
-- don't have logic to trigger opportunistic pruning in cases like
-- this.
BEGIN;
SET LOCAL enable_seqscan = false;
SET LOCAL enable_indexonlyscan = false;
SET LOCAL enable_bitmapscan = false;
-- Can't easily use a unique index, since dead tuples can be found
-- independent of the kill_prior_tuples optimization.
CREATE TABLE clean_aborted_self(key int, data text);
CREATE INDEX clean_aborted_self_key ON clean_aborted_self(key);
INSERT INTO clean_aborted_self (key, data) VALUES (-1, 'just to allocate metapage');
-- save index size from before the changes, for comparison
SELECT pg_relation_size('clean_aborted_self_key') AS clean_aborted_self_key_before \gset
DO $$
BEGIN
-- iterate often enough to see index growth even on larger-than-default page sizes
FOR i IN 1..100 LOOP
BEGIN
-- perform index scan over all the inserted keys to get them to be seen as dead
IF EXISTS(SELECT * FROM clean_aborted_self WHERE key > 0 AND key < 100) THEN
RAISE data_corrupted USING MESSAGE = 'these rows should not exist';
END IF;
INSERT INTO clean_aborted_self SELECT g.i, 'rolling back in a sec' FROM generate_series(1, 100) g(i);
-- just some error that's not normally thrown
RAISE reading_sql_data_not_permitted USING MESSAGE = 'round and round again';
EXCEPTION WHEN reading_sql_data_not_permitted THEN END;
END LOOP;
END;$$;
-- show sizes only if they differ
SELECT :clean_aborted_self_key_before AS size_before, pg_relation_size('clean_aborted_self_key') size_after
WHERE :clean_aborted_self_key_before != pg_relation_size('clean_aborted_self_key');
ROLLBACK;