Commit 90c885cd authored by Andres Freund

Increment xactCompletionCount during subtransaction abort.

Snapshot caching, introduced in 623a9ba7, did not increment
xactCompletionCount during subtransaction abort. That could lead to an older
snapshot being reused. That is, at least as far as I can see, not a
correctness issue (for MVCC snapshots there's no difference between "in
progress" and "aborted"). The only difference between the old and new
snapshots would be a newer ->xmax.
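
For context, the snapshot cache introduced in 623a9ba7 keys reuse on that
counter; roughly (a simplified sketch of the GetSnapshotData() /
GetSnapshotDataReuse() check, not the actual code):

    /*
     * Sketch: the cached snapshot may only be reused while no transaction
     * (including subtransactions) has completed since it was built, which
     * is what ShmemVariableCache->xactCompletionCount tracks.  If an
     * aborting subtransaction fails to bump the counter, this keeps
     * succeeding and the stale snapshot, with its older ->xmax, stays in
     * use.
     */
    if (snapshot->snapXactCompletionCount ==
        ShmemVariableCache->xactCompletionCount)
        return true;            /* reuse the cached snapshot */
    return false;               /* rebuild the snapshot from scratch */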

While HeapTupleSatisfiesMVCC makes the same visibility determination either
way, reusing the old snapshot leads HeapTupleSatisfiesMVCC to not set
HEAP_XMIN_INVALID, which in turn prevents the kill_prior_tuple optimization
from kicking in (via HeapTupleIsSurelyDead() returning false). The
performance effect of doing the same index lookups over and over again is how
the issue was discovered...
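
To spell out the hint-bit interaction, the relevant HeapTupleSatisfiesMVCC()
branch looks roughly like this (condensed illustration, not a verbatim
excerpt):

    /*
     * With a fresh snapshot the aborted subtransaction's xmin lies below
     * ->xmax, so we fall through to the clog check and set
     * HEAP_XMIN_INVALID; HeapTupleIsSurelyDead() can then report the tuple
     * as dead and the index scan sets kill_prior_tuple.  With the stale
     * snapshot the xmin still looks "in progress" and we return before any
     * hint bit is set.
     */
    if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot))
        return false;           /* stale snapshot takes this exit, no hint */
    else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
        SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
                    HeapTupleHeaderGetRawXmin(tuple));
    else
    {
        /* it must have aborted or crashed */
        SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, InvalidTransactionId);
        return false;
    }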

Fix the issue by incrementing xactCompletionCount in
XidCacheRemoveRunningXids. It already acquires ProcArrayLock exclusively,
making that an easy proposition.

Add a test to ensure that kill_prior_tuple prevents index growth when it
involves an aborted subtransaction of the current transaction.

Author: Andres Freund
Discussion: https://postgr.es/m/20210406043521.lopeo7bbigad3n6t@alap3.anarazel.de
Discussion: https://postgr.es/m/20210317055718.v6qs3ltzrformqoa%40alap3.anarazel.de
parent 8523492d
@@ -1210,6 +1210,11 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
 	 */
 	MaintainLatestCompletedXidRecovery(running->latestCompletedXid);
+	/*
+	 * NB: No need to increment ShmemVariableCache->xactCompletionCount here,
+	 * nobody can see it yet.
+	 */
 	LWLockRelease(ProcArrayLock);
 	/* ShmemVariableCache->nextXid must be beyond any observed xid. */

@@ -3915,6 +3920,9 @@ XidCacheRemoveRunningXids(TransactionId xid,
 	/* Also advance global latestCompletedXid while holding the lock */
 	MaintainLatestCompletedXid(latestXid);
+	/* ... and xactCompletionCount */
+	ShmemVariableCache->xactCompletionCount++;
 	LWLockRelease(ProcArrayLock);
 }
...
--
-- Verify that index scans encountering dead rows produced by an
-- aborted subtransaction of the current transaction can utilize the
-- kill_prior_tuple optimization
--
-- NB: The table size is currently *not* expected to stay the same, we
-- don't have logic to trigger opportunistic pruning in cases like
-- this.
BEGIN;
SET LOCAL enable_seqscan = false;
SET LOCAL enable_indexonlyscan = false;
SET LOCAL enable_bitmapscan = false;
-- Can't easily use a unique index, since dead tuples can be found
-- independent of the kill_prior_tuples optimization.
CREATE TABLE clean_aborted_self(key int, data text);
CREATE INDEX clean_aborted_self_key ON clean_aborted_self(key);
INSERT INTO clean_aborted_self (key, data) VALUES (-1, 'just to allocate metapage');
-- save index size from before the changes, for comparison
SELECT pg_relation_size('clean_aborted_self_key') AS clean_aborted_self_key_before \gset
DO $$
BEGIN
-- iterate often enough to see index growth even on larger-than-default page sizes
FOR i IN 1..100 LOOP
BEGIN
-- perform index scan over all the inserted keys to get them to be seen as dead
IF EXISTS(SELECT * FROM clean_aborted_self WHERE key > 0 AND key < 100) THEN
RAISE data_corrupted USING MESSAGE = 'these rows should not exist';
END IF;
INSERT INTO clean_aborted_self SELECT g.i, 'rolling back in a sec' FROM generate_series(1, 100) g(i);
-- just some error that's not normally thrown
RAISE reading_sql_data_not_permitted USING MESSAGE = 'round and round again';
EXCEPTION WHEN reading_sql_data_not_permitted THEN END;
END LOOP;
END;$$;
-- show sizes only if they differ
SELECT :clean_aborted_self_key_before AS size_before, pg_relation_size('clean_aborted_self_key') size_after
WHERE :clean_aborted_self_key_before != pg_relation_size('clean_aborted_self_key');
size_before | size_after
-------------+------------
(0 rows)
ROLLBACK;
@@ -29,7 +29,7 @@ test: strings numerology point lseg line box path polygon circle date time timet
 # geometry depends on point, lseg, box, path, polygon and circle
 # horology depends on interval, timetz, timestamp, timestamptz
 # ----------
-test: geometry horology regex type_sanity opr_sanity misc_sanity comments expressions unicode xid
+test: geometry horology regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc
 # ----------
 # These four each depend on the previous one
...
@@ -11,6 +11,7 @@ test: int4
 test: int8
 test: oid
 test: xid
+test: mvcc
 test: float4
 test: float8
 test: bit
...
--
-- Verify that index scans encountering dead rows produced by an
-- aborted subtransaction of the current transaction can utilize the
-- kill_prior_tuple optimization
--
-- NB: The table size is currently *not* expected to stay the same, we
-- don't have logic to trigger opportunistic pruning in cases like
-- this.
BEGIN;
SET LOCAL enable_seqscan = false;
SET LOCAL enable_indexonlyscan = false;
SET LOCAL enable_bitmapscan = false;
-- Can't easily use a unique index, since dead tuples can be found
-- independent of the kill_prior_tuples optimization.
CREATE TABLE clean_aborted_self(key int, data text);
CREATE INDEX clean_aborted_self_key ON clean_aborted_self(key);
INSERT INTO clean_aborted_self (key, data) VALUES (-1, 'just to allocate metapage');
-- save index size from before the changes, for comparison
SELECT pg_relation_size('clean_aborted_self_key') AS clean_aborted_self_key_before \gset
DO $$
BEGIN
-- iterate often enough to see index growth even on larger-than-default page sizes
FOR i IN 1..100 LOOP
BEGIN
-- perform index scan over all the inserted keys to get them to be seen as dead
IF EXISTS(SELECT * FROM clean_aborted_self WHERE key > 0 AND key < 100) THEN
RAISE data_corrupted USING MESSAGE = 'these rows should not exist';
END IF;
INSERT INTO clean_aborted_self SELECT g.i, 'rolling back in a sec' FROM generate_series(1, 100) g(i);
-- just some error that's not normally thrown
RAISE reading_sql_data_not_permitted USING MESSAGE = 'round and round again';
EXCEPTION WHEN reading_sql_data_not_permitted THEN END;
END LOOP;
END;$$;
-- show sizes only if they differ
SELECT :clean_aborted_self_key_before AS size_before, pg_relation_size('clean_aborted_self_key') size_after
WHERE :clean_aborted_self_key_before != pg_relation_size('clean_aborted_self_key');
ROLLBACK;