Commit 76837c15 authored by Robert Haas

Reduce use of heavyweight locking inside hash AM.

Avoid using LockPage(rel, 0, lockmode) to protect against changes to
the bucket mapping.  Instead, an exclusive buffer content lock is now
viewed as sufficient permission to modify the metapage, and a shared
buffer content lock is used when such modifications need to be
prevented.  This more relaxed locking regimen makes it possible that,
when we're busy getting a heavyweight lock on the bucket we intend
to search or insert into, a bucket split might occur underneath us.
To compensate for that possibility, we use a loop-and-retry system:
release the metapage content lock, acquire the heavyweight lock on the
target bucket, and then reacquire the metapage content lock and check
that the bucket mapping has not changed.   Normally it hasn't, and
we're done.  But if by chance it has, we simply unlock the metapage,
release the heavyweight lock we acquired previously, lock the new
bucket, and loop around again.  Even in the worst case we cannot loop
very many times here, since we don't split the same bucket again until
we've split all the other buckets, and 2^N gets big pretty fast.
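
For illustration, here is the loop-and-retry scheme in condensed form, as it
appears in the _hash_doinsert and _hash_first hunks below (a sketch only:
declarations, the actual insertion/scan work, and error handling are omitted):

    for (;;)
    {
        /* Compute the target bucket while holding the shared metapage lock. */
        bucket = _hash_hashkey2bucket(hashkey,
                                      metap->hashm_maxbucket,
                                      metap->hashm_highmask,
                                      metap->hashm_lowmask);
        blkno = BUCKET_TO_BLKNO(metap, bucket);

        /* Release the metapage content lock, but keep the pin. */
        _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

        /*
         * If the previous iteration locked what is still the correct bucket,
         * we are done; otherwise drop the now-stale bucket lock.
         */
        if (retry)
        {
            if (oldblkno == blkno)
                break;
            _hash_droplock(rel, oldblkno, HASH_SHARE);
        }

        /* Take the heavyweight per-bucket lock, then recheck the mapping. */
        _hash_getlock(rel, blkno, HASH_SHARE);
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);

        oldblkno = blkno;
        retry = true;
    }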

This results in greatly improved concurrency, because we're
effectively replacing two lwlock acquire-and-release cycles in
exclusive mode (on one of the lock manager locks) with a single
acquire-and-release cycle in shared mode (on the metapage buffer
content lock).  Testing shows that it's still not quite as good as
btree; for that, we'd probably have to find some way of getting rid
of the heavyweight bucket locks as well, which does not appear
straightforward.
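
Schematically, using the hash AM's own wrappers, the lock traffic around the
bucket computation changes roughly as follows (a simplified comparison drawn
from the hunks below, not literal code from either version):

    /* Before: taking and releasing the page-zero split-control lock is two
     * heavyweight lmgr operations, each of which acquires and releases a
     * lock-manager partition lwlock in exclusive mode. */
    _hash_getlock(rel, 0, HASH_SHARE);         /* split-control lock */
    /* ... read metapage, compute target bucket ... */
    _hash_getlock(rel, blkno, HASH_SHARE);     /* per-bucket lock */
    _hash_droplock(rel, 0, HASH_SHARE);

    /* After: the bucket mapping is read under a shared buffer content lock
     * on the (already pinned) metapage; only the per-bucket heavyweight
     * lock remains, plus the recheck loop sketched above. */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    /* ... compute bucket, drop content lock, lock bucket, recheck ... */
    _hash_getlock(rel, blkno, HASH_SHARE);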

Patch by me, review by Jeff Janes.
parent 038f3a05
......@@ -32,6 +32,8 @@ _hash_doinsert(Relation rel, IndexTuple itup)
Buffer metabuf;
HashMetaPage metap;
BlockNumber blkno;
BlockNumber oldblkno = InvalidBlockNumber;
bool retry = false;
Page page;
HashPageOpaque pageopaque;
Size itemsz;
......@@ -49,12 +51,6 @@ _hash_doinsert(Relation rel, IndexTuple itup)
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
/*
* Acquire shared split lock so we can compute the target bucket safely
* (see README).
*/
_hash_getlock(rel, 0, HASH_SHARE);
/* Read the metapage */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
......@@ -75,24 +71,44 @@ _hash_doinsert(Relation rel, IndexTuple itup)
errhint("Values larger than a buffer page cannot be indexed.")));
/*
* Compute the target bucket number, and convert to block number.
* Loop until we get a lock on the correct target bucket.
*/
bucket = _hash_hashkey2bucket(hashkey,
metap->hashm_maxbucket,
metap->hashm_highmask,
metap->hashm_lowmask);
for (;;)
{
/*
* Compute the target bucket number, and convert to block number.
*/
bucket = _hash_hashkey2bucket(hashkey,
metap->hashm_maxbucket,
metap->hashm_highmask,
metap->hashm_lowmask);
blkno = BUCKET_TO_BLKNO(metap, bucket);
blkno = BUCKET_TO_BLKNO(metap, bucket);
/* release lock on metapage, but keep pin since we'll need it again */
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
/* Release metapage lock, but keep pin. */
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
/*
* Acquire share lock on target bucket; then we can release split lock.
*/
_hash_getlock(rel, blkno, HASH_SHARE);
/*
* If the previous iteration of this loop locked what is still the
* correct target bucket, we are done. Otherwise, drop any old lock
* and lock what now appears to be the correct bucket.
*/
if (retry)
{
if (oldblkno == blkno)
break;
_hash_droplock(rel, oldblkno, HASH_SHARE);
}
_hash_getlock(rel, blkno, HASH_SHARE);
_hash_droplock(rel, 0, HASH_SHARE);
/*
* Reacquire metapage lock and check that no bucket split has taken
* place while we were awaiting the bucket lock.
*/
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
oldblkno = blkno;
retry = true;
}
/* Fetch the primary bucket page for the bucket */
buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
......
......@@ -57,9 +57,9 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf,
/*
* _hash_getlock() -- Acquire an lmgr lock.
*
* 'whichlock' should be zero to acquire the split-control lock, or the
* block number of a bucket's primary bucket page to acquire the per-bucket
* lock. (See README for details of the use of these locks.)
'whichlock' should be the block number of a bucket's primary bucket page to
* acquire the per-bucket lock. (See README for details of the use of these
* locks.)
*
* 'access' must be HASH_SHARE or HASH_EXCLUSIVE.
*/
......@@ -507,21 +507,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
uint32 lowmask;
/*
* Obtain the page-zero lock to assert the right to begin a split (see
* README).
*
* Note: deadlock should be impossible here. Our own backend could only be
* holding bucket sharelocks due to stopped indexscans; those will not
* block other holders of the page-zero lock, who are only interested in
* acquiring bucket sharelocks themselves. Exclusive bucket locks are
* only taken here and in hashbulkdelete, and neither of these operations
* needs any additional locks to complete. (If, due to some flaw in this
* reasoning, we manage to deadlock anyway, it's okay to error out; the
* index will be left in a consistent state.)
* Write-lock the meta page. It used to be necessary to acquire a
* heavyweight lock to begin a split, but that is no longer required.
*/
_hash_getlock(rel, 0, HASH_EXCLUSIVE);
/* Write-lock the meta page */
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
_hash_checkpage(rel, metabuf, LH_META_PAGE);
......@@ -663,9 +651,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
/* Write out the metapage and drop lock, but keep pin */
_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
/* Release split lock; okay for other splits to occur now */
_hash_droplock(rel, 0, HASH_EXCLUSIVE);
/* Relocate records to the new bucket */
_hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
start_oblkno, start_nblkno,
......@@ -682,9 +667,6 @@ fail:
/* We didn't write the metapage, so just drop lock */
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
/* Release split lock */
_hash_droplock(rel, 0, HASH_EXCLUSIVE);
}
......
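
For context, _hash_getlock and _hash_droplock (whose header comment is
adjusted above) are thin wrappers over the page-level lock manager; their
bodies, which this patch does not change, look roughly like the following
(reconstructed from the contemporary hashpage.c, so treat it as an
approximation rather than part of the diff):

    void
    _hash_getlock(Relation rel, BlockNumber whichlock, int access)
    {
        /* Heavyweight locking is skipped for relations only this backend can see. */
        if (USELOCKING(rel))
            LockPage(rel, whichlock, access);
    }

    void
    _hash_droplock(Relation rel, BlockNumber whichlock, int access)
    {
        if (USELOCKING(rel))
            UnlockPage(rel, whichlock, access);
    }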
......@@ -125,6 +125,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
uint32 hashkey;
Bucket bucket;
BlockNumber blkno;
BlockNumber oldblkno = InvalidBuffer;
bool retry = false;
Buffer buf;
Buffer metabuf;
Page page;
......@@ -184,35 +186,52 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
so->hashso_sk_hash = hashkey;
/*
* Acquire shared split lock so we can compute the target bucket safely
* (see README).
*/
_hash_getlock(rel, 0, HASH_SHARE);
/* Read the metapage */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
/*
* Compute the target bucket number, and convert to block number.
* Loop until we get a lock on the correct target bucket.
*/
bucket = _hash_hashkey2bucket(hashkey,
metap->hashm_maxbucket,
metap->hashm_highmask,
metap->hashm_lowmask);
blkno = BUCKET_TO_BLKNO(metap, bucket);
for (;;)
{
/*
* Compute the target bucket number, and convert to block number.
*/
bucket = _hash_hashkey2bucket(hashkey,
metap->hashm_maxbucket,
metap->hashm_highmask,
metap->hashm_lowmask);
blkno = BUCKET_TO_BLKNO(metap, bucket);
/* Release metapage lock, but keep pin. */
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
/*
* If the previous iteration of this loop locked what is still the
* correct target bucket, we are done. Otherwise, drop any old lock
* and lock what now appears to be the correct bucket.
*/
if (retry)
{
if (oldblkno == blkno)
break;
_hash_droplock(rel, oldblkno, HASH_SHARE);
}
_hash_getlock(rel, blkno, HASH_SHARE);
/*
* Reacquire metapage lock and check that no bucket split has taken
* place while we were awaiting the bucket lock.
*/
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
oldblkno = blkno;
retry = true;
}
/* done with the metapage */
_hash_relbuf(rel, metabuf);
/*
* Acquire share lock on target bucket; then we can release split lock.
*/
_hash_getlock(rel, blkno, HASH_SHARE);
_hash_droplock(rel, 0, HASH_SHARE);
_hash_dropbuf(rel, metabuf);
/* Update scan opaque state to show we have lock on the bucket */
so->hashso_bucket = bucket;
......