Commit 955a684e authored by Andres Freund's avatar Andres Freund

Fix race condition leading to hanging logical slot creation.

The snapshot assembly during the creation of logical slots relied
waiting for transactions in xl_running_xacts to end, by checking for
their commit/abort records.  Unfortunately, despite locking, it is
possible to see an xl_running_xact record listing transactions as
ready, that have already WAL-logged an commit/abort record, as the
locking just prevents the ProcArray to be adjusted, and the commit
record has to be logged first.

That lead to either delayed or hanging snapshot creation, because
snapbuild.c would wait "forever" to see commit/abort records for some
transactions.  That hang resolved only if a xl_running_xacts record
without any running transactions happened to be logged, far from
certain on a busy server.

It's impractical to prevent that via more heavyweight locking, the
likelihood of deadlocks and significantly increased contention would
be too big.

Instead change the initial snapshot creation to be solely based on
tracking the oldest running transaction via
xl_running_xacts->oldestRunningXid - that actually ends up
significantly simplifying the code.  That has two disadvantages:
1) Because we cannot fully "trust" the contents of xl_running_xacts,
   we cannot use it to build the initial snapshot.  Instead we have to
   wait twice for all running transactions to finish.
2) Previously a slot, unless the race occurred, could be created when
   the all transaction perceived as running based on commit/abort
   records, now we have to wait for the next xl_running_xacts record.
To address that, trigger logging new xl_running_xacts record from
within snapbuild.c exactly when necessary.

Unfortunately snabuild.c's SnapBuild is stored on disk, one of the
stupider ideas of a certain Mr Freund, so we can't change it in a
minor release.  As this is going to be backpatched, we have to hack
around a bit to keep on-disk compatibility.  A later commit will
rejigger that on master.

Author: Andres Freund, based on a quite different patch from Petr Jelinek
Analyzed-By: Petr Jelinek
Reviewed-By: Petr Jelinek
Discussion: https://postgr.es/m/f37e975c-908f-858e-707f-058d3b1eb214@2ndquadrant.com
Backpatch: 9.4-, where logical decoding has been introduced
parent 9aab83fc
Parsed test spec with 3 sessions
starting permutation: s2txid s1init s3txid s2alter s2c s1insert s1checkpoint s1start s1insert s1alter s1insert s1start
step s2txid: BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT txid_current() IS NULL;
starting permutation: s2b s2txid s1init s3b s3txid s2alter s2c s2b s2txid s3c s2c s1insert s1checkpoint s1start s1insert s1alter s1insert s1start
step s2b: BEGIN;
step s2txid: SELECT txid_current() IS NULL;
?column?
f
step s1init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); <waiting ...>
step s3txid: BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT txid_current() IS NULL;
step s3b: BEGIN;
step s3txid: SELECT txid_current() IS NULL;
?column?
f
step s2alter: ALTER TABLE do_write ADD COLUMN addedbys2 int;
step s2c: COMMIT;
step s2b: BEGIN;
step s2txid: SELECT txid_current() IS NULL;
?column?
f
step s3c: COMMIT;
step s1init: <... completed>
?column?
init
step s2c: COMMIT;
step s1insert: INSERT INTO do_write DEFAULT VALUES;
step s1checkpoint: CHECKPOINT;
step s1start: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', 'false');
......
......@@ -24,7 +24,8 @@ step "s1alter" { ALTER TABLE do_write ADD COLUMN addedbys1 int; }
session "s2"
setup { SET synchronous_commit=on; }
step "s2txid" { BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT txid_current() IS NULL; }
step "s2b" { BEGIN; }
step "s2txid" { SELECT txid_current() IS NULL; }
step "s2alter" { ALTER TABLE do_write ADD COLUMN addedbys2 int; }
step "s2c" { COMMIT; }
......@@ -32,7 +33,8 @@ step "s2c" { COMMIT; }
session "s3"
setup { SET synchronous_commit=on; }
step "s3txid" { BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT txid_current() IS NULL; }
step "s3b" { BEGIN; }
step "s3txid" { SELECT txid_current() IS NULL; }
step "s3c" { COMMIT; }
# Force usage of ondisk snapshot by starting and not finishing a
......@@ -40,4 +42,4 @@ step "s3c" { COMMIT; }
# reached. In combination with a checkpoint forcing a snapshot to be
# written and a new restart point computed that'll lead to the usage
# of the snapshot.
permutation "s2txid" "s1init" "s3txid" "s2alter" "s2c" "s1insert" "s1checkpoint" "s1start" "s1insert" "s1alter" "s1insert" "s1start"
permutation "s2b" "s2txid" "s1init" "s3b" "s3txid" "s2alter" "s2c" "s2b" "s2txid" "s3c" "s2c" "s1insert" "s1checkpoint" "s1start" "s1insert" "s1alter" "s1insert" "s1start"
......@@ -622,9 +622,6 @@ DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
{
int i;
SnapBuildAbortTxn(ctx->snapshot_builder, buf->record->EndRecPtr, xid,
parsed->nsubxacts, parsed->subxacts);
for (i = 0; i < parsed->nsubxacts; i++)
{
ReorderBufferAbort(ctx->reorder, parsed->subxacts[i],
......
......@@ -1725,7 +1725,7 @@ ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
{
elog(DEBUG1, "aborting old transaction %u", txn->xid);
elog(DEBUG2, "aborting old transaction %u", txn->xid);
/* remove potential on-disk data, and deallocate this tx */
ReorderBufferCleanupTXN(rb, txn);
......
This diff is collapsed.
......@@ -20,24 +20,30 @@ typedef enum
/*
* Initial state, we can't do much yet.
*/
SNAPBUILD_START,
SNAPBUILD_START = -1,
/*
* Collecting committed transactions, to build the initial catalog
* snapshot.
*/
SNAPBUILD_BUILDING_SNAPSHOT = 0,
/*
* We have collected enough information to decode tuples in transactions
* that started after this.
*
* Once we reached this we start to collect changes. We cannot apply them
* yet because the might be based on transactions that were still running
* when we reached them yet.
* yet, because they might be based on transactions that were still running
* when FULL_SNAPSHOT was reached.
*/
SNAPBUILD_FULL_SNAPSHOT,
SNAPBUILD_FULL_SNAPSHOT = 1,
/*
* Found a point after hitting built_full_snapshot where all transactions
* that were running at that point finished. Till we reach that we hold
* off calling any commit callbacks.
* Found a point after SNAPBUILD_FULL_SNAPSHOT where all transactions that
* were running at that point finished. Till we reach that we hold off
* calling any commit callbacks.
*/
SNAPBUILD_CONSISTENT
SNAPBUILD_CONSISTENT = 2
} SnapBuildState;
/* forward declare so we don't have to expose the struct to the public */
......@@ -73,9 +79,6 @@ extern bool SnapBuildXactNeedsSkip(SnapBuild *snapstate, XLogRecPtr ptr);
extern void SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn,
TransactionId xid, int nsubxacts,
TransactionId *subxacts);
extern void SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn,
TransactionId xid, int nsubxacts,
TransactionId *subxacts);
extern bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid,
XLogRecPtr lsn);
extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment