Commit 6d46f478 authored by Robert Haas

Improve hash index bucket split behavior.

Previously, the right to split a bucket was represented by a
heavyweight lock on the page number of the primary bucket page.
Unfortunately, this meant that every scan needed to take a heavyweight
lock on that bucket also, which was bad for concurrency.  Instead, use
a cleanup lock on the primary bucket page to indicate the right to
begin a split, so that scans only need to retain a pin on that page,
which they would have to acquire anyway, and which is also much
cheaper.
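
To make the protocol change concrete, here is a minimal sketch of the splitter's side using the core buffer-manager API (illustration only: try_begin_split is an invented name, and the real splitter, _hash_expandtable, does considerably more bookkeeping):

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/*
 * A cleanup lock is an exclusive content lock that can only be taken when
 * no other backend holds a pin on the page.  A scan that merely keeps its
 * pin on the primary bucket page therefore holds off a split, with no
 * heavyweight-lock traffic on either side.
 */
static bool
try_begin_split(Relation rel, BlockNumber bucket_blkno)
{
	Buffer		buf = ReadBuffer(rel, bucket_blkno);

	if (!ConditionalLockBufferForCleanup(buf))
	{
		/* somebody's scan still pins this bucket; try again later */
		ReleaseBuffer(buf);
		return false;
	}

	/* ...flag the bucket as being split and proceed, step by step... */
	UnlockReleaseBuffer(buf);
	return true;
}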

In addition to reducing the locking cost, this also avoids locking out
scans and inserts for the entire lifetime of the split: while the new
bucket is being populated with copies of the appropriate tuples from
the old bucket, scans and inserts can happen in parallel.  There are
minor concurrency improvements for vacuum operations as well, though
the situation there is still far from ideal.

This patch also removes the unworldly assumption that a split will
never be interrupted.  With the new code, a split is done in a series
of small steps and the system can pick up where it left off if it is
interrupted prior to completion.  While this patch does not itself add
write-ahead logging for hash indexes, it is clearly a necessary first
step, since one of the things that could interrupt a split is the
removal of electrical power from the machine performing it.
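
The crash-recoverable state lives in page flag bits rather than in backend memory. A hedged sketch of how an interrupted split gets picked up (mirroring the pattern the hashinsert.c change below uses; resume_interrupted_split is an invented wrapper, while the flag macro and _hash_finish_split are added by this patch):

/* Caller holds a pin and a read lock on the old bucket's primary page. */
static void
resume_interrupted_split(Relation rel, Buffer metabuf, Buffer old_buf,
						 HashPageOpaque oldopaque, uint32 maxbucket,
						 uint32 highmask, uint32 lowmask)
{
	/*
	 * LH_BUCKET_BEING_SPLIT stays set on the old bucket until the split
	 * commits, so a split interrupted by an error -- or, once write-ahead
	 * logging exists, by a crash -- is detectable afterwards and can be
	 * completed by whichever backend stumbles on it next.
	 */
	if (H_BUCKET_BEING_SPLIT(oldopaque) && IsBufferCleanupOK(old_buf))
	{
		/* trade the content lock for no lock, keeping the pin */
		_hash_chgbufaccess(rel, old_buf, HASH_READ, HASH_NOLOCK);
		_hash_finish_split(rel, metabuf, old_buf, oldopaque->hasho_bucket,
						   maxbucket, highmask, lowmask);
	}
}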

Amit Kapila.  I wrote the original design on which this patch is
based, and did a good bit of work on the comments and README through
multiple rounds of review, but all of the code is Amit's.  Also
reviewed by Jesper Pedersen, Jeff Janes, and others.

Discussion: http://postgr.es/m/CAA4eK1LfzcZYxLoXS874Ad0+S-ZM60U9bwcyiUZx9mHZ-KCWhw@mail.gmail.com
parent 213c0f2d
src/backend/access/hash/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/access/hash
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
-OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
-	hashsearch.o hashsort.o hashutil.o hashvalidate.o
+OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o \
+	hashsort.o hashutil.o hashvalidate.o
 include $(top_srcdir)/src/backend/common.mk
(two collapsed file diffs not shown)
src/backend/access/hash/hashinsert.c
@@ -28,18 +28,22 @@
 void
 _hash_doinsert(Relation rel, IndexTuple itup)
 {
-	Buffer		buf;
+	Buffer		buf = InvalidBuffer;
+	Buffer		bucket_buf;
 	Buffer		metabuf;
 	HashMetaPage metap;
 	BlockNumber blkno;
-	BlockNumber oldblkno = InvalidBlockNumber;
-	bool		retry = false;
+	BlockNumber oldblkno;
+	bool		retry;
 	Page		page;
 	HashPageOpaque pageopaque;
 	Size		itemsz;
 	bool		do_expand;
 	uint32		hashkey;
 	Bucket		bucket;
+	uint32		maxbucket;
+	uint32		highmask;
+	uint32		lowmask;
 
 	/*
 	 * Get the hash key for the item (it's stored in the index tuple itself).
@@ -51,6 +55,7 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
 								 * need to be consistent */
 
+restart_insert:
 	/* Read the metapage */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
 	metap = HashPageGetMeta(BufferGetPage(metabuf));
@@ -69,6 +74,9 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 						itemsz, HashMaxItemSize((Page) metap)),
 			   errhint("Values larger than a buffer page cannot be indexed.")));
 
+	oldblkno = InvalidBlockNumber;
+	retry = false;
+
 	/*
 	 * Loop until we get a lock on the correct target bucket.
 	 */
@@ -84,21 +92,32 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 
 		blkno = BUCKET_TO_BLKNO(metap, bucket);
 
+		/*
+		 * Copy bucket mapping info now; refer the comment in
+		 * _hash_expandtable where we copy this information before calling
+		 * _hash_splitbucket to see why this is okay.
+		 */
+		maxbucket = metap->hashm_maxbucket;
+		highmask = metap->hashm_highmask;
+		lowmask = metap->hashm_lowmask;
+
 		/* Release metapage lock, but keep pin. */
 		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 
 		/*
-		 * If the previous iteration of this loop locked what is still the
-		 * correct target bucket, we are done.  Otherwise, drop any old lock
-		 * and lock what now appears to be the correct bucket.
+		 * If the previous iteration of this loop locked the primary page of
+		 * what is still the correct target bucket, we are done.  Otherwise,
+		 * drop any old lock before acquiring the new one.
		 */
 		if (retry)
 		{
 			if (oldblkno == blkno)
 				break;
-			_hash_droplock(rel, oldblkno, HASH_SHARE);
+			_hash_relbuf(rel, buf);
 		}
-		_hash_getlock(rel, blkno, HASH_SHARE);
+
+		/* Fetch and lock the primary bucket page for the target bucket */
+		buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
 
 		/*
 		 * Reacquire metapage lock and check that no bucket split has taken
@@ -109,12 +128,36 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 		retry = true;
 	}
 
-	/* Fetch the primary bucket page for the bucket */
-	buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
+	/* remember the primary bucket buffer to release the pin on it at end. */
+	bucket_buf = buf;
+
 	page = BufferGetPage(buf);
 	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
 	Assert(pageopaque->hasho_bucket == bucket);
 
+	/*
+	 * If this bucket is in the process of being split, try to finish the
+	 * split before inserting, because that might create room for the
+	 * insertion to proceed without allocating an additional overflow page.
+	 * It's only interesting to finish the split if we're trying to insert
+	 * into the bucket from which we're removing tuples (the "old" bucket),
+	 * not if we're trying to insert into the bucket into which tuples are
+	 * being moved (the "new" bucket).
+	 */
+	if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
+	{
+		/* release the lock on bucket buffer, before completing the split. */
+		_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
+
+		_hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
+						   maxbucket, highmask, lowmask);
+
+		/* release the pin on old and meta buffer.  retry for insert. */
+		_hash_dropbuf(rel, buf);
+		_hash_dropbuf(rel, metabuf);
+		goto restart_insert;
+	}
+
 	/* Do the insertion */
 	while (PageGetFreeSpace(page) < itemsz)
 	{
@@ -127,9 +170,15 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 		{
 			/*
 			 * ovfl page exists; go get it.  if it doesn't have room, we'll
-			 * find out next pass through the loop test above.
+			 * find out next pass through the loop test above.  we always
+			 * release both the lock and pin if this is an overflow page, but
+			 * only the lock if this is the primary bucket page, since the pin
+			 * on the primary bucket must be retained throughout the scan.
			 */
-			_hash_relbuf(rel, buf);
+			if (buf != bucket_buf)
+				_hash_relbuf(rel, buf);
+			else
+				_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
 			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
 			page = BufferGetPage(buf);
 		}
@@ -144,7 +193,7 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
 
 			/* chain to a new overflow page */
-			buf = _hash_addovflpage(rel, metabuf, buf);
+			buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
 			page = BufferGetPage(buf);
 
 			/* should fit now, given test above */
@@ -158,11 +207,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 	/* found page with enough space, so add the item here */
 	(void) _hash_pgaddtup(rel, buf, itemsz, itup);
 
-	/* write and release the modified page */
+	/*
+	 * write and release the modified page.  if the page we modified was an
+	 * overflow page, we also need to separately drop the pin we retained on
+	 * the primary bucket page.
+	 */
 	_hash_wrtbuf(rel, buf);
-
-	/* We can drop the bucket lock now */
-	_hash_droplock(rel, blkno, HASH_SHARE);
+	if (buf != bucket_buf)
+		_hash_dropbuf(rel, bucket_buf);
 
 	/*
 	 * Write-lock the metapage so we can increment the tuple count.  After
(two collapsed file diffs not shown)

deleted file: src/backend/access/hash/hashscan.c
/*-------------------------------------------------------------------------
*
* hashscan.c
* manage scans on hash tables
*
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/hash/hashscan.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/hash.h"
#include "access/relscan.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/resowner.h"
/*
* We track all of a backend's active scans on hash indexes using a list
* of HashScanListData structs, which are allocated in TopMemoryContext.
* It's okay to use a long-lived context because we rely on the ResourceOwner
* mechanism to clean up unused entries after transaction or subtransaction
* abort. We can't safely keep the entries in the executor's per-query
* context, because that might be already freed before we get a chance to
* clean up the list. (XXX seems like there should be a better way to
* manage this...)
*/
typedef struct HashScanListData
{
IndexScanDesc hashsl_scan;
ResourceOwner hashsl_owner;
struct HashScanListData *hashsl_next;
} HashScanListData;
typedef HashScanListData *HashScanList;
static HashScanList HashScans = NULL;
/*
* ReleaseResources_hash() --- clean up hash subsystem resources.
*
* This is here because it needs to touch this module's static var HashScans.
*/
void
ReleaseResources_hash(void)
{
HashScanList l;
HashScanList prev;
HashScanList next;
/*
* Release all HashScanList items belonging to the current ResourceOwner.
* Note that we do not release the underlying IndexScanDesc; that's in
* executor memory and will go away on its own (in fact quite possibly has
* gone away already, so we mustn't try to touch it here).
*
* Note: this should be a no-op during normal query shutdown. However, in
* an abort situation ExecutorEnd is not called and so there may be open
* index scans to clean up.
*/
prev = NULL;
for (l = HashScans; l != NULL; l = next)
{
next = l->hashsl_next;
if (l->hashsl_owner == CurrentResourceOwner)
{
if (prev == NULL)
HashScans = next;
else
prev->hashsl_next = next;
pfree(l);
/* prev does not change */
}
else
prev = l;
}
}
/*
* _hash_regscan() -- register a new scan.
*/
void
_hash_regscan(IndexScanDesc scan)
{
HashScanList new_el;
new_el = (HashScanList) MemoryContextAlloc(TopMemoryContext,
sizeof(HashScanListData));
new_el->hashsl_scan = scan;
new_el->hashsl_owner = CurrentResourceOwner;
new_el->hashsl_next = HashScans;
HashScans = new_el;
}
/*
* _hash_dropscan() -- drop a scan from the scan list
*/
void
_hash_dropscan(IndexScanDesc scan)
{
HashScanList chk,
last;
last = NULL;
for (chk = HashScans;
chk != NULL && chk->hashsl_scan != scan;
chk = chk->hashsl_next)
last = chk;
if (chk == NULL)
elog(ERROR, "hash scan list trashed; cannot find 0x%p", (void *) scan);
if (last == NULL)
HashScans = chk->hashsl_next;
else
last->hashsl_next = chk->hashsl_next;
pfree(chk);
}
/*
* Is there an active scan in this bucket?
*/
bool
_hash_has_active_scan(Relation rel, Bucket bucket)
{
Oid relid = RelationGetRelid(rel);
HashScanList l;
for (l = HashScans; l != NULL; l = l->hashsl_next)
{
if (relid == l->hashsl_scan->indexRelation->rd_id)
{
HashScanOpaque so = (HashScanOpaque) l->hashsl_scan->opaque;
if (so->hashso_bucket_valid &&
so->hashso_bucket == bucket)
return true;
}
}
return false;
}
(collapsed file diff not shown)

src/backend/access/hash/hashutil.c
@@ -20,6 +20,8 @@
#include "utils/lsyscache.h" #include "utils/lsyscache.h"
#include "utils/rel.h" #include "utils/rel.h"
#define CALC_NEW_BUCKET(old_bucket, lowmask) \
old_bucket | (lowmask + 1)
/* /*
* _hash_checkqual -- does the index tuple satisfy the scan conditions? * _hash_checkqual -- does the index tuple satisfy the scan conditions?
@@ -352,3 +354,95 @@ _hash_binsearch_last(Page page, uint32 hash_value)
 	return lower;
 }
+
+/*
+ * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
+ *			from which current (new) bucket is being split.
+ */
+BlockNumber
+_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
+{
+	Bucket		old_bucket;
+	uint32		mask;
+	Buffer		metabuf;
+	HashMetaPage metap;
+	BlockNumber blkno;
+
+	/*
+	 * To get the old bucket from the current bucket, we need a mask to modulo
+	 * into lower half of table.  This mask is stored in meta page as
+	 * hashm_lowmask, but here we can't rely on the same, because we need a
+	 * value of lowmask that was prevalent at the time when bucket split was
+	 * started.  Masking the most significant bit of new bucket would give us
+	 * old bucket.
+	 */
+	mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
+	old_bucket = new_bucket & mask;
+
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+	blkno = BUCKET_TO_BLKNO(metap, old_bucket);
+
+	_hash_relbuf(rel, metabuf);
+
+	return blkno;
+}
+
+/*
+ * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
+ *			that will be generated after split from old bucket.
+ *
+ * This is used to find the new bucket from old bucket based on current table
+ * half.  It is mainly required to finish the incomplete splits where we are
+ * sure that not more than one bucket could have split in progress from old
+ * bucket.
+ */
+BlockNumber
+_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
+{
+	Bucket		new_bucket;
+	Buffer		metabuf;
+	HashMetaPage metap;
+	BlockNumber blkno;
+
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
+													metap->hashm_lowmask,
+													metap->hashm_maxbucket);
+	blkno = BUCKET_TO_BLKNO(metap, new_bucket);
+
+	_hash_relbuf(rel, metabuf);
+
+	return blkno;
+}
+
+/*
+ * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
+ *			generated after split from current (old) bucket.
+ *
+ * This is used to find the new bucket from old bucket.  New bucket can be
+ * obtained by OR'ing old bucket with most significant bit of current table
+ * half (lowmask passed in this function can be used to identify msb of
+ * current table half).  There could be multiple buckets that could have
+ * been split from current bucket.  We need the first such bucket that exists.
+ * Caller must ensure that no more than one split has happened from old
+ * bucket.
+ */
+Bucket
+_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
+								   uint32 lowmask, uint32 maxbucket)
+{
+	Bucket		new_bucket;
+
+	new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
+	if (new_bucket > maxbucket)
+	{
+		lowmask = lowmask >> 1;
+		new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
+	}
+
+	return new_bucket;
+}
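
As a sanity check on the bucket arithmetic above, here is a self-contained worked example (editorial illustration, not part of the patch; msb_pos stands in for the fls() the patch uses, and the macro is reparenthesized for standalone use):

#include <stdio.h>

/* Same arithmetic as CALC_NEW_BUCKET above, parenthesized for safety. */
#define CALC_NEW_BUCKET(old_bucket, lowmask) \
	((old_bucket) | ((lowmask) + 1))

/* 1-based position of the most significant set bit, like fls(). */
static int
msb_pos(unsigned int x)
{
	int			pos = 0;

	while (x != 0)
	{
		pos++;
		x >>= 1;
	}
	return pos;
}

int
main(void)
{
	/* A table with buckets 0..10: maxbucket = 10, current lowmask = 7. */
	unsigned int maxbucket = 10;
	unsigned int lowmask = 7;
	unsigned int nb;

	/* Bucket 2's target in the current half: 2 | 8 = 10, which exists. */
	printf("new bucket for 2: %u\n", CALC_NEW_BUCKET(2, lowmask));

	/*
	 * Bucket 3's target 3 | 8 = 11 exceeds maxbucket, so the fallback in
	 * _hash_get_newbucket_from_oldbucket halves lowmask, giving 3 | 4 = 7:
	 * the most recent split target of bucket 3 that actually exists.
	 */
	nb = CALC_NEW_BUCKET(3, lowmask);
	if (nb > maxbucket)
		nb = CALC_NEW_BUCKET(3, lowmask >> 1);
	printf("new bucket for 3: %u\n", nb);

	/*
	 * Reverse direction, as in _hash_get_oldblock_from_newbucket(): mask
	 * off the most significant bit to recover the old bucket, e.g.
	 * 10 = 1010b -> mask 0111b -> old bucket 2.
	 */
	printf("old bucket for 10: %u\n",
		   10 & ((1U << (msb_pos(10) - 1)) - 1));
	return 0;
}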
src/backend/utils/resowner/resowner.c
@@ -668,9 +668,6 @@ ResourceOwnerReleaseInternal(ResourceOwner owner,
 				PrintFileLeakWarning(res);
 			FileClose(res);
 		}
-
-		/* Clean up index scans too */
-		ReleaseResources_hash();
 	}
 
 	/* Let add-on modules get a chance too */
src/include/access/hash.h
@@ -24,6 +24,7 @@
 #include "lib/stringinfo.h"
 #include "storage/bufmgr.h"
 #include "storage/lockdefs.h"
+#include "utils/hsearch.h"
 #include "utils/relcache.h"
 
 /*
@@ -32,6 +33,8 @@
  */
 typedef uint32 Bucket;
 
+#define InvalidBucket	((Bucket) 0xFFFFFFFF)
+
 #define BUCKET_TO_BLKNO(metap,B) \
 	((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
 
@@ -51,6 +54,9 @@ typedef uint32 Bucket;
 #define LH_BUCKET_PAGE			(1 << 1)
 #define LH_BITMAP_PAGE			(1 << 2)
 #define LH_META_PAGE			(1 << 3)
+#define LH_BUCKET_BEING_POPULATED	(1 << 4)
+#define LH_BUCKET_BEING_SPLIT	(1 << 5)
+#define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
 
 typedef struct HashPageOpaqueData
 {
@@ -63,6 +69,10 @@ typedef struct HashPageOpaqueData
 
 typedef HashPageOpaqueData *HashPageOpaque;
 
+#define H_NEEDS_SPLIT_CLEANUP(opaque)	((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP)
+#define H_BUCKET_BEING_SPLIT(opaque)	((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT)
+#define H_BUCKET_BEING_POPULATED(opaque)	((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED)
+
 /*
  * The page ID is for the convenience of pg_filedump and similar utilities,
  * which otherwise would have a hard time telling pages of different index
@@ -79,19 +89,6 @@ typedef struct HashScanOpaqueData
 	/* Hash value of the scan key, ie, the hash key we seek */
 	uint32		hashso_sk_hash;
 
-	/*
-	 * By definition, a hash scan should be examining only one bucket. We
-	 * record the bucket number here as soon as it is known.
-	 */
-	Bucket		hashso_bucket;
-	bool		hashso_bucket_valid;
-
-	/*
-	 * If we have a share lock on the bucket, we record it here.  When
-	 * hashso_bucket_blkno is zero, we have no such lock.
-	 */
-	BlockNumber hashso_bucket_blkno;
-
 	/*
 	 * We also want to remember which buffer we're currently examining in the
 	 * scan. We keep the buffer pinned (but not locked) across hashgettuple
@@ -100,11 +97,30 @@ typedef struct HashScanOpaqueData
 	 */
 	Buffer		hashso_curbuf;
 
+	/* remember the buffer associated with primary bucket */
+	Buffer		hashso_bucket_buf;
+
+	/*
+	 * remember the buffer associated with primary bucket page of bucket being
+	 * split.  it is required during the scan of the bucket which is being
+	 * populated during split operation.
+	 */
+	Buffer		hashso_split_bucket_buf;
+
 	/* Current position of the scan, as an index TID */
 	ItemPointerData hashso_curpos;
 
 	/* Current position of the scan, as a heap TID */
 	ItemPointerData hashso_heappos;
+
+	/* Whether scan starts on bucket being populated due to split */
+	bool		hashso_buc_populated;
+
+	/*
+	 * Whether scanning bucket being split?  The value of this parameter is
+	 * referred only when hashso_buc_populated is true.
+	 */
+	bool		hashso_buc_split;
 } HashScanOpaqueData;
 
 typedef HashScanOpaqueData *HashScanOpaque;
@@ -175,6 +191,8 @@ typedef HashMetaPageData *HashMetaPage;
 			sizeof(ItemIdData) - \
 			MAXALIGN(sizeof(HashPageOpaqueData)))
 
+#define INDEX_MOVED_BY_SPLIT_MASK	0x2000
+
 #define HASH_MIN_FILLFACTOR			10
 #define HASH_DEFAULT_FILLFACTOR		75
 
@@ -223,9 +241,6 @@ typedef HashMetaPageData *HashMetaPage;
 #define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE
 #define HASH_NOLOCK		(-1)
 
-#define HASH_SHARE		ShareLock
-#define HASH_EXCLUSIVE	ExclusiveLock
-
 /*
 * Strategy number. There's only one valid strategy for hashing: equality.
 */
@@ -297,21 +312,21 @@ extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
 			   Size itemsize, IndexTuple itup);
 
 /* hashovfl.c */
-extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
-extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf,
-				   BufferAccessStrategy bstrategy);
+extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
+extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf,
+				   bool wbuf_dirty, BufferAccessStrategy bstrategy);
 extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
 				 BlockNumber blkno, ForkNumber forkNum);
 extern void _hash_squeezebucket(Relation rel,
 					Bucket bucket, BlockNumber bucket_blkno,
+					Buffer bucket_buf,
 					BufferAccessStrategy bstrategy);
 
 /* hashpage.c */
-extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
-extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
-extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
 extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
 			  int access, int flags);
+extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
+								   BlockNumber blkno, int flags);
 extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
 extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
 				ForkNumber forkNum);
@@ -320,6 +335,7 @@ extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
 						   BufferAccessStrategy bstrategy);
 extern void _hash_relbuf(Relation rel, Buffer buf);
 extern void _hash_dropbuf(Relation rel, Buffer buf);
+extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
 extern void _hash_wrtbuf(Relation rel, Buffer buf);
 extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
 				   int to_access);
@@ -327,12 +343,9 @@ extern uint32 _hash_metapinit(Relation rel, double num_tuples,
 			   ForkNumber forkNum);
 extern void _hash_pageinit(Page page, Size size);
 extern void _hash_expandtable(Relation rel, Buffer metabuf);
-
-/* hashscan.c */
-extern void _hash_regscan(IndexScanDesc scan);
-extern void _hash_dropscan(IndexScanDesc scan);
-extern bool _hash_has_active_scan(Relation rel, Bucket bucket);
-extern void ReleaseResources_hash(void);
+extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
+				   Bucket obucket, uint32 maxbucket, uint32 highmask,
+				   uint32 lowmask);
 
 /* hashsearch.c */
 extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
@@ -362,5 +375,18 @@ extern bool _hash_convert_tuple(Relation index,
 					Datum *index_values, bool *index_isnull);
 extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
 extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
+extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
+extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
+extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
+									uint32 lowmask, uint32 maxbucket);
+
+/* hash.c */
+extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
+				  Buffer bucket_buf, BlockNumber bucket_blkno,
+				  BufferAccessStrategy bstrategy,
+				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
+				  double *tuples_removed, double *num_index_tuples,
+				  bool bucket_has_garbage,
+				  IndexBulkDeleteCallback callback, void *callback_state);
 
 #endif   /* HASH_H */
src/include/access/itup.h
@@ -63,7 +63,7 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap;
  * t_info manipulation macros
  */
 #define INDEX_SIZE_MASK 0x1FFF
-/* bit 0x2000 is not used at present */
+/* bit 0x2000 is reserved for index-AM specific usage */
 #define INDEX_VAR_MASK	0x4000
 #define INDEX_NULL_MASK 0x8000
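
Since t_info packs the tuple size together with flag bits, the hash AM can use the newly reserved bit like this (a hedged sketch: the two inline helpers are invented for illustration; INDEX_MOVED_BY_SPLIT_MASK is the define this commit adds to hash.h above):

#include "postgres.h"
#include "access/itup.h"

#define INDEX_MOVED_BY_SPLIT_MASK	0x2000	/* as added to hash.h above */

/* Mark a tuple that was copied to the new bucket by a split (sketch). */
static inline void
mark_moved_by_split(IndexTuple itup)
{
	itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
}

/* Test the marker; the size bits under INDEX_SIZE_MASK are unaffected. */
static inline bool
is_moved_by_split(IndexTuple itup)
{
	return (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK) != 0;
}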