Commit b508a56f authored by Teodor Sigaev's avatar Teodor Sigaev

Predicate locking in hash indexes.

Hash index searches acquire predicate locks on the primary
page of a bucket. A scan that runs concurrently with a page split
acquires a lock on the primary pages of both the old and the new
bucket. During a bucket split, the predicate lock is copied from the
primary page of the old bucket to the primary page of the new bucket.

Author: Shubham Barai, Amit Kapila
Reviewed by: Amit Kapila, Alexander Korotkov, Thomas Munro
Discussion: https://www.postgresql.org/message-id/flat/CALxAEPvNsM2GTiXdRgaaZ1Pjd1bs+sxfFsf7Ytr+iq+5JJoYXA@mail.gmail.com
parent 971d7ddb
...@@ -68,7 +68,7 @@ hashhandler(PG_FUNCTION_ARGS) ...@@ -68,7 +68,7 @@ hashhandler(PG_FUNCTION_ARGS)
amroutine->amsearchnulls = false; amroutine->amsearchnulls = false;
amroutine->amstorage = false; amroutine->amstorage = false;
amroutine->amclusterable = false; amroutine->amclusterable = false;
amroutine->ampredlocks = false; amroutine->ampredlocks = true;
amroutine->amcanparallel = false; amroutine->amcanparallel = false;
amroutine->amkeytype = INT4OID; amroutine->amkeytype = INT4OID;
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "utils/rel.h" #include "utils/rel.h"
#include "storage/lwlock.h" #include "storage/lwlock.h"
#include "storage/buf_internals.h" #include "storage/buf_internals.h"
#include "storage/predicate.h"
static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
RelFileNode hnode); RelFileNode hnode);
...@@ -88,6 +89,8 @@ restart_insert: ...@@ -88,6 +89,8 @@ restart_insert:
&usedmetap); &usedmetap);
Assert(usedmetap != NULL); Assert(usedmetap != NULL);
CheckForSerializableConflictIn(rel, NULL, buf);
/* remember the primary bucket buffer to release the pin on it at end. */ /* remember the primary bucket buffer to release the pin on it at end. */
bucket_buf = buf; bucket_buf = buf;
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include "miscadmin.h" #include "miscadmin.h"
#include "storage/lmgr.h" #include "storage/lmgr.h"
#include "storage/smgr.h" #include "storage/smgr.h"
#include "storage/predicate.h"
static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
...@@ -1107,6 +1108,11 @@ _hash_splitbucket(Relation rel, ...@@ -1107,6 +1108,11 @@ _hash_splitbucket(Relation rel,
npage = BufferGetPage(nbuf); npage = BufferGetPage(nbuf);
nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
/* Copy the predicate locks from old bucket to new bucket. */
PredicateLockPageSplit(rel,
BufferGetBlockNumber(bucket_obuf),
BufferGetBlockNumber(bucket_nbuf));
/* /*
* Partition the tuples in the old bucket between the old bucket and the * Partition the tuples in the old bucket between the old bucket and the
* new bucket, advancing along the old bucket's overflow bucket chain and * new bucket, advancing along the old bucket's overflow bucket chain and
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "miscadmin.h" #include "miscadmin.h"
#include "pgstat.h" #include "pgstat.h"
#include "utils/rel.h" #include "utils/rel.h"
#include "storage/predicate.h"
static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP, static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP,
ScanDirection dir); ScanDirection dir);
...@@ -171,6 +172,7 @@ _hash_readnext(IndexScanDesc scan, ...@@ -171,6 +172,7 @@ _hash_readnext(IndexScanDesc scan,
Assert(BufferIsValid(*bufp)); Assert(BufferIsValid(*bufp));
LockBuffer(*bufp, BUFFER_LOCK_SHARE); LockBuffer(*bufp, BUFFER_LOCK_SHARE);
PredicateLockPage(rel, BufferGetBlockNumber(*bufp), scan->xs_snapshot);
/* /*
* setting hashso_buc_split to true indicates that we are scanning * setting hashso_buc_split to true indicates that we are scanning
...@@ -347,6 +349,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) ...@@ -347,6 +349,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
so->hashso_sk_hash = hashkey; so->hashso_sk_hash = hashkey;
buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL); buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
page = BufferGetPage(buf); page = BufferGetPage(buf);
TestForOldSnapshot(scan->xs_snapshot, rel, page); TestForOldSnapshot(scan->xs_snapshot, rel, page);
opaque = (HashPageOpaque) PageGetSpecialPointer(page); opaque = (HashPageOpaque) PageGetSpecialPointer(page);
......
...@@ -389,6 +389,13 @@ relation is required. Fast update postpones the insertion of tuples into index ...@@ -389,6 +389,13 @@ relation is required. Fast update postpones the insertion of tuples into index
structure by temporarily storing them into pending list. That makes us unable structure by temporarily storing them into pending list. That makes us unable
to detect r-w conflicts using page-level locks. to detect r-w conflicts using page-level locks.
* Hash index searches acquire predicate locks on the primary
page of a bucket. A scan that runs concurrently with a page split
acquires a lock on the primary pages of both the old and the new
bucket. During a bucket split, the predicate lock is copied from the
primary page of the old bucket to the primary page of the new bucket.
* The effects of page splits, overflows, consolidations, and * The effects of page splits, overflows, consolidations, and
removals must be carefully reviewed to ensure that predicate locks removals must be carefully reviewed to ensure that predicate locks
aren't "lost" during those operations, or kept with pages which could aren't "lost" during those operations, or kept with pages which could
......
This diff is collapsed.
...@@ -70,6 +70,7 @@ test: async-notify ...@@ -70,6 +70,7 @@ test: async-notify
test: vacuum-reltuples test: vacuum-reltuples
test: timeouts test: timeouts
test: vacuum-concurrent-drop test: vacuum-concurrent-drop
test: predicate-hash
test: predicate-gist test: predicate-gist
test: predicate-gin test: predicate-gin
# The checksum_enable suite will enable checksums for the cluster so should # The checksum_enable suite will enable checksums for the cluster so should
......
# Test for page level predicate locking in hash index
#
# Test to verify serialization failures and to check reduced false positives
#
# To verify serialization failures, queries and permutations are written in such
# a way that an index scan (from one transaction) and an index insert (from
# another transaction) will try to access the same bucket of the index
# whereas to check reduced false positives, they will try to access different
# buckets of the index.

# Load four batches of ten rows, one batch per key value (10, 20, 30, 40),
# so that later steps can aim a scan and an insert at either the same hash
# bucket (equal keys) or at different buckets (distinct keys).
setup
{
create table hash_tbl(id int4, p integer);
create index hash_idx on hash_tbl using hash(p);
insert into hash_tbl (id, p)
select g, 10 from generate_series(1, 10) g;
insert into hash_tbl (id, p)
select g, 20 from generate_series(11, 20) g;
insert into hash_tbl (id, p)
select g, 30 from generate_series(21, 30) g;
insert into hash_tbl (id, p)
select g, 40 from generate_series(31, 40) g;
}

teardown
{
drop table hash_tbl;
}

# Both sessions run at SERIALIZABLE and disable every other scan type so
# the planner is forced onto the hash index, ensuring page-level predicate
# locks are actually taken.
session "s1"
setup
{
begin isolation level serializable;
set enable_seqscan=off;
set enable_bitmapscan=off;
set enable_indexonlyscan=on;
}
# rxy1 reads the p=20 bucket while wx1 writes the p=30 bucket; together
# with s2's rxy2/wy2 (which read p=30 and write p=20) these four steps can
# form a write-skew r-w cycle.
step "rxy1" { select sum(p) from hash_tbl where p=20; }
step "wx1" { insert into hash_tbl (id, p)
select g, 30 from generate_series(41, 50) g; }
# rxy3/wx3 use keys (20 and 50) disjoint from s2's rxy4/wy4 (30 and 60),
# so no r-w conflict should ever be reported for these pairs.
step "rxy3" { select sum(p) from hash_tbl where p=20; }
step "wx3" { insert into hash_tbl (id, p)
select g, 50 from generate_series(41, 50) g; }
step "c1" { commit; }

session "s2"
setup
{
begin isolation level serializable;
set enable_seqscan=off;
set enable_bitmapscan=off;
set enable_indexonlyscan=on;
}
# rxy2 reads the p=30 bucket (written by s1's wx1) and wy2 writes the
# p=20 bucket (read by s1's rxy1) -- the other half of the write-skew cycle.
step "rxy2" { select sum(p) from hash_tbl where p=30; }
step "wy2" { insert into hash_tbl (id, p)
select g, 20 from generate_series(51, 60) g; }
# rxy4/wy4 use keys (30 and 60) disjoint from s1's rxy3/wx3 (20 and 50).
step "rxy4" { select sum(p) from hash_tbl where p=30; }
step "wy4" { insert into hash_tbl (id, p)
select g, 60 from generate_series(51, 60) g; }
step "c2" { commit; }

# An index scan (from one transaction) and an index insert (from another
# transaction) try to access the same bucket of the index but one transaction
# commits before other transaction begins so no r-w conflict.

permutation "rxy1" "wx1" "c1" "rxy2" "wy2" "c2"
permutation "rxy2" "wy2" "c2" "rxy1" "wx1" "c1"

# An index scan (from one transaction) and an index insert (from another
# transaction) try to access different buckets of the index and also one
# transaction commits before other transaction begins, so no r-w conflict.

permutation "rxy3" "wx3" "c1" "rxy4" "wy4" "c2"
permutation "rxy4" "wy4" "c2" "rxy3" "wx3" "c1"

# An index scan (from one transaction) and an index insert (from another
# transaction) try to access the same bucket of the index and one transaction
# begins before other transaction commits so there is a r-w conflict.

permutation "rxy1" "wx1" "rxy2" "c1" "wy2" "c2"
permutation "rxy1" "wx1" "rxy2" "wy2" "c1" "c2"
permutation "rxy1" "wx1" "rxy2" "wy2" "c2" "c1"
permutation "rxy1" "rxy2" "wx1" "c1" "wy2" "c2"
permutation "rxy1" "rxy2" "wx1" "wy2" "c1" "c2"
permutation "rxy1" "rxy2" "wx1" "wy2" "c2" "c1"
permutation "rxy1" "rxy2" "wy2" "wx1" "c1" "c2"
permutation "rxy1" "rxy2" "wy2" "wx1" "c2" "c1"
permutation "rxy1" "rxy2" "wy2" "c2" "wx1" "c1"
permutation "rxy2" "rxy1" "wx1" "c1" "wy2" "c2"
permutation "rxy2" "rxy1" "wx1" "wy2" "c1" "c2"
permutation "rxy2" "rxy1" "wx1" "wy2" "c2" "c1"
permutation "rxy2" "rxy1" "wy2" "wx1" "c1" "c2"
permutation "rxy2" "rxy1" "wy2" "wx1" "c2" "c1"
permutation "rxy2" "rxy1" "wy2" "c2" "wx1" "c1"
permutation "rxy2" "wy2" "rxy1" "wx1" "c1" "c2"
permutation "rxy2" "wy2" "rxy1" "wx1" "c2" "c1"
permutation "rxy2" "wy2" "rxy1" "c2" "wx1" "c1"

# An index scan (from one transaction) and an index insert (from another
# transaction) try to access different buckets of the index so no r-w conflict.

permutation "rxy3" "wx3" "rxy4" "c1" "wy4" "c2"
permutation "rxy3" "wx3" "rxy4" "wy4" "c1" "c2"
permutation "rxy3" "wx3" "rxy4" "wy4" "c2" "c1"
permutation "rxy3" "rxy4" "wx3" "c1" "wy4" "c2"
permutation "rxy3" "rxy4" "wx3" "wy4" "c1" "c2"
permutation "rxy3" "rxy4" "wx3" "wy4" "c2" "c1"
permutation "rxy3" "rxy4" "wy4" "wx3" "c1" "c2"
permutation "rxy3" "rxy4" "wy4" "wx3" "c2" "c1"
permutation "rxy3" "rxy4" "wy4" "c2" "wx3" "c1"
permutation "rxy4" "rxy3" "wx3" "c1" "wy4" "c2"
permutation "rxy4" "rxy3" "wx3" "wy4" "c1" "c2"
permutation "rxy4" "rxy3" "wx3" "wy4" "c2" "c1"
permutation "rxy4" "rxy3" "wy4" "wx3" "c1" "c2"
permutation "rxy4" "rxy3" "wy4" "wx3" "c2" "c1"
permutation "rxy4" "rxy3" "wy4" "c2" "wx3" "c1"
permutation "rxy4" "wy4" "rxy3" "wx3" "c1" "c2"
permutation "rxy4" "wy4" "rxy3" "wx3" "c2" "c1"
permutation "rxy4" "wy4" "rxy3" "c2" "wx3" "c1"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment