Commit b508a56f authored by Teodor Sigaev's avatar Teodor Sigaev

Predicate locking in hash indexes.

Hash index searches acquire predicate locks on the primary
page of a bucket. A scan that runs concurrently with a page split
acquires a lock on the primary pages of both the old and the new
bucket. During a bucket split, the predicate lock is copied from the
primary page of the old bucket to the primary page of the new bucket.

Author: Shubham Barai, Amit Kapila
Reviewed by: Amit Kapila, Alexander Korotkov, Thomas Munro
Discussion: https://www.postgresql.org/message-id/flat/CALxAEPvNsM2GTiXdRgaaZ1Pjd1bs+sxfFsf7Ytr+iq+5JJoYXA@mail.gmail.com
parent 971d7ddb
...@@ -68,7 +68,7 @@ hashhandler(PG_FUNCTION_ARGS) ...@@ -68,7 +68,7 @@ hashhandler(PG_FUNCTION_ARGS)
amroutine->amsearchnulls = false; amroutine->amsearchnulls = false;
amroutine->amstorage = false; amroutine->amstorage = false;
amroutine->amclusterable = false; amroutine->amclusterable = false;
amroutine->ampredlocks = false; amroutine->ampredlocks = true;
amroutine->amcanparallel = false; amroutine->amcanparallel = false;
amroutine->amkeytype = INT4OID; amroutine->amkeytype = INT4OID;
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "utils/rel.h" #include "utils/rel.h"
#include "storage/lwlock.h" #include "storage/lwlock.h"
#include "storage/buf_internals.h" #include "storage/buf_internals.h"
#include "storage/predicate.h"
static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
RelFileNode hnode); RelFileNode hnode);
...@@ -88,6 +89,8 @@ restart_insert: ...@@ -88,6 +89,8 @@ restart_insert:
&usedmetap); &usedmetap);
Assert(usedmetap != NULL); Assert(usedmetap != NULL);
CheckForSerializableConflictIn(rel, NULL, buf);
/* remember the primary bucket buffer to release the pin on it at end. */ /* remember the primary bucket buffer to release the pin on it at end. */
bucket_buf = buf; bucket_buf = buf;
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include "miscadmin.h" #include "miscadmin.h"
#include "storage/lmgr.h" #include "storage/lmgr.h"
#include "storage/smgr.h" #include "storage/smgr.h"
#include "storage/predicate.h"
static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
...@@ -1107,6 +1108,11 @@ _hash_splitbucket(Relation rel, ...@@ -1107,6 +1108,11 @@ _hash_splitbucket(Relation rel,
npage = BufferGetPage(nbuf); npage = BufferGetPage(nbuf);
nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
/* Copy the predicate locks from old bucket to new bucket. */
PredicateLockPageSplit(rel,
BufferGetBlockNumber(bucket_obuf),
BufferGetBlockNumber(bucket_nbuf));
/* /*
* Partition the tuples in the old bucket between the old bucket and the * Partition the tuples in the old bucket between the old bucket and the
* new bucket, advancing along the old bucket's overflow bucket chain and * new bucket, advancing along the old bucket's overflow bucket chain and
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "miscadmin.h" #include "miscadmin.h"
#include "pgstat.h" #include "pgstat.h"
#include "utils/rel.h" #include "utils/rel.h"
#include "storage/predicate.h"
static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP, static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP,
ScanDirection dir); ScanDirection dir);
...@@ -171,6 +172,7 @@ _hash_readnext(IndexScanDesc scan, ...@@ -171,6 +172,7 @@ _hash_readnext(IndexScanDesc scan,
Assert(BufferIsValid(*bufp)); Assert(BufferIsValid(*bufp));
LockBuffer(*bufp, BUFFER_LOCK_SHARE); LockBuffer(*bufp, BUFFER_LOCK_SHARE);
PredicateLockPage(rel, BufferGetBlockNumber(*bufp), scan->xs_snapshot);
/* /*
* setting hashso_buc_split to true indicates that we are scanning * setting hashso_buc_split to true indicates that we are scanning
...@@ -347,6 +349,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) ...@@ -347,6 +349,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
so->hashso_sk_hash = hashkey; so->hashso_sk_hash = hashkey;
buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL); buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
page = BufferGetPage(buf); page = BufferGetPage(buf);
TestForOldSnapshot(scan->xs_snapshot, rel, page); TestForOldSnapshot(scan->xs_snapshot, rel, page);
opaque = (HashPageOpaque) PageGetSpecialPointer(page); opaque = (HashPageOpaque) PageGetSpecialPointer(page);
......
...@@ -389,6 +389,13 @@ relation is required. Fast update postpones the insertion of tuples into index ...@@ -389,6 +389,13 @@ relation is required. Fast update postpones the insertion of tuples into index
structure by temporarily storing them into pending list. That makes us unable structure by temporarily storing them into pending list. That makes us unable
to detect r-w conflicts using page-level locks. to detect r-w conflicts using page-level locks.
* Hash index searches acquire predicate locks on the primary
page of a bucket. A scan that runs concurrently with a page split
acquires a lock on the primary pages of both the old and the new
bucket. During a bucket split, the predicate lock is copied from the
primary page of the old bucket to the primary page of the new bucket.
* The effects of page splits, overflows, consolidations, and * The effects of page splits, overflows, consolidations, and
removals must be carefully reviewed to ensure that predicate locks removals must be carefully reviewed to ensure that predicate locks
aren't "lost" during those operations, or kept with pages which could aren't "lost" during those operations, or kept with pages which could
......
This diff is collapsed.
...@@ -70,6 +70,7 @@ test: async-notify ...@@ -70,6 +70,7 @@ test: async-notify
test: vacuum-reltuples test: vacuum-reltuples
test: timeouts test: timeouts
test: vacuum-concurrent-drop test: vacuum-concurrent-drop
test: predicate-hash
test: predicate-gist test: predicate-gist
test: predicate-gin test: predicate-gin
# The checksum_enable suite will enable checksums for the cluster so should # The checksum_enable suite will enable checksums for the cluster so should
......
# Test for page level predicate locking in hash index
#
# Test to verify serialization failures and to check reduced false positives
#
# To verify serialization failures, queries and permutations are written in such
# a way that an index scan (from one transaction) and an index insert (from
# another transaction) will try to access the same bucket of the index
# whereas to check reduced false positives, they will try to access different
# buckets of the index.

# Load four batches of ten rows, one batch per key value (10, 20, 30, 40),
# so that later steps can aim a scan and an insert at either the same hash
# bucket (equal keys) or at different buckets (distinct keys).
setup
{
create table hash_tbl(id int4, p integer);
create index hash_idx on hash_tbl using hash(p);
insert into hash_tbl (id, p)
select g, 10 from generate_series(1, 10) g;
insert into hash_tbl (id, p)
select g, 20 from generate_series(11, 20) g;
insert into hash_tbl (id, p)
select g, 30 from generate_series(21, 30) g;
insert into hash_tbl (id, p)
select g, 40 from generate_series(31, 40) g;
}

teardown
{
drop table hash_tbl;
}

# Both sessions run at SERIALIZABLE and disable every other scan type so
# the planner is forced onto the hash index, ensuring page-level predicate
# locks are actually taken.
session "s1"
setup
{
begin isolation level serializable;
set enable_seqscan=off;
set enable_bitmapscan=off;
set enable_indexonlyscan=on;
}
# rxy1 reads the p=20 bucket while wx1 writes the p=30 bucket; together
# with s2's rxy2/wy2 (which read p=30 and write p=20) these four steps can
# form a write-skew r-w cycle.
step "rxy1" { select sum(p) from hash_tbl where p=20; }
step "wx1" { insert into hash_tbl (id, p)
select g, 30 from generate_series(41, 50) g; }
# rxy3/wx3 use keys (20 and 50) disjoint from s2's rxy4/wy4 (30 and 60),
# so no r-w conflict should ever be reported for these pairs.
step "rxy3" { select sum(p) from hash_tbl where p=20; }
step "wx3" { insert into hash_tbl (id, p)
select g, 50 from generate_series(41, 50) g; }
step "c1" { commit; }

session "s2"
setup
{
begin isolation level serializable;
set enable_seqscan=off;
set enable_bitmapscan=off;
set enable_indexonlyscan=on;
}
# rxy2 reads the p=30 bucket (written by s1's wx1) and wy2 writes the
# p=20 bucket (read by s1's rxy1) -- the other half of the write-skew cycle.
step "rxy2" { select sum(p) from hash_tbl where p=30; }
step "wy2" { insert into hash_tbl (id, p)
select g, 20 from generate_series(51, 60) g; }
# rxy4/wy4 use keys (30 and 60) disjoint from s1's rxy3/wx3 (20 and 50).
step "rxy4" { select sum(p) from hash_tbl where p=30; }
step "wy4" { insert into hash_tbl (id, p)
select g, 60 from generate_series(51, 60) g; }
step "c2" { commit; }

# An index scan (from one transaction) and an index insert (from another
# transaction) try to access the same bucket of the index but one transaction
# commits before other transaction begins so no r-w conflict.

permutation "rxy1" "wx1" "c1" "rxy2" "wy2" "c2"
permutation "rxy2" "wy2" "c2" "rxy1" "wx1" "c1"

# An index scan (from one transaction) and an index insert (from another
# transaction) try to access different buckets of the index and also one
# transaction commits before other transaction begins, so no r-w conflict.

permutation "rxy3" "wx3" "c1" "rxy4" "wy4" "c2"
permutation "rxy4" "wy4" "c2" "rxy3" "wx3" "c1"

# An index scan (from one transaction) and an index insert (from another
# transaction) try to access the same bucket of the index and one transaction
# begins before other transaction commits so there is a r-w conflict.

permutation "rxy1" "wx1" "rxy2" "c1" "wy2" "c2"
permutation "rxy1" "wx1" "rxy2" "wy2" "c1" "c2"
permutation "rxy1" "wx1" "rxy2" "wy2" "c2" "c1"
permutation "rxy1" "rxy2" "wx1" "c1" "wy2" "c2"
permutation "rxy1" "rxy2" "wx1" "wy2" "c1" "c2"
permutation "rxy1" "rxy2" "wx1" "wy2" "c2" "c1"
permutation "rxy1" "rxy2" "wy2" "wx1" "c1" "c2"
permutation "rxy1" "rxy2" "wy2" "wx1" "c2" "c1"
permutation "rxy1" "rxy2" "wy2" "c2" "wx1" "c1"
permutation "rxy2" "rxy1" "wx1" "c1" "wy2" "c2"
permutation "rxy2" "rxy1" "wx1" "wy2" "c1" "c2"
permutation "rxy2" "rxy1" "wx1" "wy2" "c2" "c1"
permutation "rxy2" "rxy1" "wy2" "wx1" "c1" "c2"
permutation "rxy2" "rxy1" "wy2" "wx1" "c2" "c1"
permutation "rxy2" "rxy1" "wy2" "c2" "wx1" "c1"
permutation "rxy2" "wy2" "rxy1" "wx1" "c1" "c2"
permutation "rxy2" "wy2" "rxy1" "wx1" "c2" "c1"
permutation "rxy2" "wy2" "rxy1" "c2" "wx1" "c1"

# An index scan (from one transaction) and an index insert (from another
# transaction) try to access different buckets of the index so no r-w conflict.

permutation "rxy3" "wx3" "rxy4" "c1" "wy4" "c2"
permutation "rxy3" "wx3" "rxy4" "wy4" "c1" "c2"
permutation "rxy3" "wx3" "rxy4" "wy4" "c2" "c1"
permutation "rxy3" "rxy4" "wx3" "c1" "wy4" "c2"
permutation "rxy3" "rxy4" "wx3" "wy4" "c1" "c2"
permutation "rxy3" "rxy4" "wx3" "wy4" "c2" "c1"
permutation "rxy3" "rxy4" "wy4" "wx3" "c1" "c2"
permutation "rxy3" "rxy4" "wy4" "wx3" "c2" "c1"
permutation "rxy3" "rxy4" "wy4" "c2" "wx3" "c1"
permutation "rxy4" "rxy3" "wx3" "c1" "wy4" "c2"
permutation "rxy4" "rxy3" "wx3" "wy4" "c1" "c2"
permutation "rxy4" "rxy3" "wx3" "wy4" "c2" "c1"
permutation "rxy4" "rxy3" "wy4" "wx3" "c1" "c2"
permutation "rxy4" "rxy3" "wy4" "wx3" "c2" "c1"
permutation "rxy4" "rxy3" "wy4" "c2" "wx3" "c1"
permutation "rxy4" "wy4" "rxy3" "wx3" "c1" "c2"
permutation "rxy4" "wy4" "rxy3" "wx3" "c2" "c1"
permutation "rxy4" "wy4" "rxy3" "c2" "wx3" "c1"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment