Reimplement hash index locking algorithms, per my recent proposal to
pghackers. This fixes the problem recently reported by Markus Kräutner (hash bucket split corrupts the state of scans being done concurrently), and I believe it also fixes all the known problems with deadlocks in hash index operations. Hash indexes are still not really ready for prime time (since they aren't WAL-logged), but this is a step forward.

Reimplement hash index locking algorithms, per my recent proposal to
pghackers. This fixes the problem recently reported by Markus Kräutner (hash bucket split corrupts the state of scans being done concurrently), and I believe it also fixes all the known problems with deadlocks in hash index operations. Hash indexes are still not really ready for prime time (since they aren't WAL-logged), but this is a step forward.
7a369371 · Tom Lane · ca43f71c · 7a369371 · 7a369371 · 7a369371
Commit 7a369371 authored Sep 04, 2003 by Tom Lane
11 changed files
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
-$Header: /cvsroot/pgsql/src/backend/access/hash/README,v 1.2 2003/09/02 03:29:01 tgl Exp $
+$Header: /cvsroot/pgsql/src/backend/access/hash/README,v 1.3 2003/09/04 22:06:27 tgl Exp $

 This directory contains an implementation of hash indexing for Postgres.

@@ -229,8 +229,8 @@ existing bucket in two, thereby lowering the fill ratio:
 	check split still needed
 	if split not needed anymore, drop locks and exit
 	decide which bucket to split
-	Attempt to X-lock new bucket number (shouldn't fail, but...)
 	Attempt to X-lock old bucket number (definitely could fail)
+	Attempt to X-lock new bucket number (shouldn't fail, but...)
 	if above fail, drop locks and exit
 	update meta page to reflect new number of buckets
 	write/release meta page
@@ -261,12 +261,6 @@ not be overfull and split attempts will stop.  (We could make a successful
 splitter loop to see if the index is still overfull, but it seems better to
 distribute the split overhead across successive insertions.)

-It may be wise to make the initial exclusive-lock-page-zero operation a
-conditional one as well, although the odds of a deadlock failure are quite
-low.  (AFAICS it could only deadlock against a VACUUM operation that is
-trying to X-lock a bucket that the current process has a stopped indexscan
-in.)
-
 A problem is that if a split fails partway through (eg due to insufficient
 disk space) the index is left corrupt.  The probability of that could be
 made quite low if we grab a free page or two before we update the meta

--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
--- a/src/backend/access/hash/hashscan.c
+++ b/src/backend/access/hash/hashscan.c
@@ -8,22 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.30 2003/08/04 02:39:57 momjian Exp $
- *
- * NOTES
- *	  Because we can be doing an index scan on a relation while we
- *	  update it, we need to avoid missing data that moves around in
- *	  the index.  The routines and global variables in this file
- *	  guarantee that all scans in the local address space stay
- *	  correctly positioned.  This is all we need to worry about, since
- *	  write locking guarantees that no one else will be on the same
- *	  page at the same time as we are.
- *
- *	  The scheme is to manage a list of active scans in the current
- *	  backend.	Whenever we add or remove records from an index, we
- *	  check the list of active scans to see if any has been affected.
- *	  A scan is affected only if it is on the same relation, and the
- *	  same page, as the update.
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.31 2003/09/04 22:06:27 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -44,10 +29,6 @@ typedef HashScanListData *HashScanList;
 static HashScanList HashScans = (HashScanList) NULL;


-static void _hash_scandel(IndexScanDesc scan,
-			  BlockNumber blkno, OffsetNumber offno);
-
-
 /*
 * AtEOXact_hash() --- clean up hash subsystem at xact abort or commit.
 *
@@ -67,9 +48,6 @@ AtEOXact_hash(void)
 	 * at end of transaction anyway.
 	 */
 	HashScans = NULL;
-
-	/* If we were building a hash, we ain't anymore. */
-	BuildingHash = false;
 }

 /*
@@ -112,70 +90,26 @@ _hash_dropscan(IndexScanDesc scan)
 	pfree(chk);
 }

-void
-_hash_adjscans(Relation rel, ItemPointer tid)
+/*
+ * Is there an active scan in this bucket?
+ */
+bool
+_hash_has_active_scan(Relation rel, Bucket bucket)
 {
+	Oid			relid = RelationGetRelid(rel);
 	HashScanList l;
-	Oid			relid;

-	relid = RelationGetRelid(rel);
-	for (l = HashScans; l != (HashScanList) NULL; l = l->hashsl_next)
+	for (l = HashScans; l != NULL; l = l->hashsl_next)
 	{
 		if (relid == l->hashsl_scan->indexRelation->rd_id)
-			_hash_scandel(l->hashsl_scan, ItemPointerGetBlockNumber(tid),
-						  ItemPointerGetOffsetNumber(tid));
-	}
-}
+		{
+			HashScanOpaque so = (HashScanOpaque) l->hashsl_scan->opaque;

-static void
-_hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
-{
-	ItemPointer current;
-	ItemPointer mark;
-	Buffer		buf;
-	Buffer		metabuf;
-	HashScanOpaque so;
-
-	so = (HashScanOpaque) scan->opaque;
-	current = &(scan->currentItemData);
-	mark = &(scan->currentMarkData);
-
-	if (ItemPointerIsValid(current)
-		&& ItemPointerGetBlockNumber(current) == blkno
-		&& ItemPointerGetOffsetNumber(current) >= offno)
-	{
-		metabuf = _hash_getbuf(scan->indexRelation, HASH_METAPAGE, HASH_READ);
-		buf = so->hashso_curbuf;
-		_hash_step(scan, &buf, BackwardScanDirection, metabuf);
+			if (so->hashso_bucket_valid &&
+				so->hashso_bucket == bucket)
+				return true;
+		}
 	}

-	if (ItemPointerIsValid(mark)
-		&& ItemPointerGetBlockNumber(mark) == blkno
-		&& ItemPointerGetOffsetNumber(mark) >= offno)
-	{
-		/*
-		 * The idea here is to exchange the current and mark positions,
-		 * then step backwards (affecting current), then exchange again.
-		 */
-		ItemPointerData tmpitem;
-		Buffer		tmpbuf;
-
-		tmpitem = *mark;
-		*mark = *current;
-		*current = tmpitem;
-		tmpbuf = so->hashso_mrkbuf;
-		so->hashso_mrkbuf = so->hashso_curbuf;
-		so->hashso_curbuf = tmpbuf;
-
-		metabuf = _hash_getbuf(scan->indexRelation, HASH_METAPAGE, HASH_READ);
-		buf = so->hashso_curbuf;
-		_hash_step(scan, &buf, BackwardScanDirection, metabuf);
-
-		tmpitem = *mark;
-		*mark = *current;
-		*current = tmpitem;
-		tmpbuf = so->hashso_mrkbuf;
-		so->hashso_mrkbuf = so->hashso_curbuf;
-		so->hashso_curbuf = tmpbuf;
-	}
+	return false;
 }
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.35 2003/09/02 18:13:31 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.36 2003/09/04 22:06:27 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -19,46 +19,6 @@
 #include "access/iqual.h"


-/*
- * _hash_mkscankey -- build a scan key matching the given indextuple
- *
- * Note: this is prepared for multiple index columns, but very little
- * else in access/hash is ...
- */
-ScanKey
-_hash_mkscankey(Relation rel, IndexTuple itup)
-{
-	ScanKey		skey;
-	TupleDesc	itupdesc = RelationGetDescr(rel);
-	int			natts = rel->rd_rel->relnatts;
-	AttrNumber	i;
-	Datum		arg;
-	FmgrInfo   *procinfo;
-	bool		isnull;
-
-	skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
-
-	for (i = 0; i < natts; i++)
-	{
-		arg = index_getattr(itup, i + 1, itupdesc, &isnull);
-		procinfo = index_getprocinfo(rel, i + 1, HASHPROC);
-		ScanKeyEntryInitializeWithInfo(&skey[i],
-									   isnull ? SK_ISNULL : 0x0,
-									   (AttrNumber) (i + 1),
-									   procinfo,
-									   CurrentMemoryContext,
-									   arg);
-	}
-
-	return skey;
-}
-
-void
-_hash_freeskey(ScanKey skey)
-{
-	pfree(skey);
-}
-
 /*
 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
 */
@@ -102,24 +62,31 @@ _hash_formitem(IndexTuple itup)
 }

 /*
- * _hash_call -- given a Datum, call the index's hash procedure
- *
- * Returns the bucket number that the hash key maps to.
+ * _hash_datum2hashkey -- given a Datum, call the index's hash procedure
 */
-Bucket
-_hash_call(Relation rel, HashMetaPage metap, Datum key)
+uint32
+_hash_datum2hashkey(Relation rel, Datum key)
 {
 	FmgrInfo   *procinfo;
-	uint32		n;
-	Bucket		bucket;

 	/* XXX assumes index has only one attribute */
 	procinfo = index_getprocinfo(rel, 1, HASHPROC);
-	n = DatumGetUInt32(FunctionCall1(procinfo, key));

-	bucket = n & metap->hashm_highmask;
-	if (bucket > metap->hashm_maxbucket)
-		bucket = bucket & metap->hashm_lowmask;
+	return DatumGetUInt32(FunctionCall1(procinfo, key));
+}
+
+/*
+ * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
+ */
+Bucket
+_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
+					 uint32 highmask, uint32 lowmask)
+{
+	Bucket		bucket;
+
+	bucket = hashkey & highmask;
+	if (bucket > maxbucket)
+		bucket = bucket & lowmask;

 	return bucket;
 }

--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.59 2003/08/17 22:41:12 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.60 2003/09/04 22:06:27 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -153,7 +153,7 @@ LockRelation(Relation relation, LOCKMODE lockmode)
 * As above, but only lock if we can get the lock without blocking.
 * Returns TRUE iff the lock was acquired.
 *
- * NOTE: we do not currently need conditional versions of the other
+ * NOTE: we do not currently need conditional versions of all the
 * LockXXX routines in this file, but they could easily be added if needed.
 */
 bool
@@ -264,6 +264,26 @@ LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
 		elog(ERROR, "LockAcquire failed");
 }

+/*
+ *		ConditionalLockPage
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns TRUE iff the lock was acquired.
+ */
+bool
+ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+	LOCKTAG		tag;
+
+	MemSet(&tag, 0, sizeof(tag));
+	tag.relId = relation->rd_lockInfo.lockRelId.relId;
+	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
+	tag.objId.blkno = blkno;
+
+	return LockAcquire(LockTableId, &tag, GetCurrentTransactionId(),
+					   lockmode, true);
+}
+
 /*
 *		UnlockPage
 */

--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: hash.h,v 1.52 2003/09/02 18:13:32 tgl Exp $
+ * $Id: hash.h,v 1.53 2003/09/04 22:06:27 tgl Exp $
 *
 * NOTES
 *		modeled after Margo Seltzer's hash implementation for unix.
@@ -70,13 +70,27 @@ typedef HashPageOpaqueData *HashPageOpaque;
 #define HASHO_FILL		0x1234

 /*
- *	ScanOpaqueData is used to remember which buffers we're currently
- *	examining in the scan.	We keep these buffers locked and pinned and
- *	recorded in the opaque entry of the scan in order to avoid doing a
- *	ReadBuffer() for every tuple in the index.
+ *	HashScanOpaqueData is private state for a hash index scan.
 */
 typedef struct HashScanOpaqueData
 {
+	/*
+	 * By definition, a hash scan should be examining only one bucket.
+	 * We record the bucket number here as soon as it is known.
+	 */
+	Bucket		hashso_bucket;
+	bool		hashso_bucket_valid;
+	/*
+	 * If we have a share lock on the bucket, we record it here.  When
+	 * hashso_bucket_blkno is zero, we have no such lock.
+	 */
+	BlockNumber	hashso_bucket_blkno;
+	/*
+	 * We also want to remember which buffers we're currently examining in the
+	 * scan. We keep these buffers pinned (but not locked) across hashgettuple
+	 * calls, in order to avoid doing a ReadBuffer() for every tuple in the
+	 * index.
+	 */
 	Buffer		hashso_curbuf;
 	Buffer		hashso_mrkbuf;
 } HashScanOpaqueData;
@@ -148,10 +162,18 @@ typedef struct HashItemData

 typedef HashItemData *HashItem;

+/*
+ * Maximum size of a hash index item (it's okay to have only one per page)
+ */
+#define HashMaxItemSize(page) \
+	(PageGetPageSize(page) - \
+	 sizeof(PageHeaderData) - \
+	 MAXALIGN(sizeof(HashPageOpaqueData)) - \
+	 sizeof(ItemIdData))
+
 /*
 * Constants
 */
-#define DEFAULT_FFACTOR			300
 #define BYTE_TO_BIT				3		/* 2^3 bits/byte */
 #define ALL_SET					((uint32) ~0)

@@ -180,10 +202,14 @@ typedef HashItemData *HashItem;
 #define ISSET(A, N)		((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))

 /*
- * page locking modes
+ * page-level and high-level locking modes (see README)
 */
-#define HASH_READ		0
-#define HASH_WRITE		1
+#define HASH_READ		BUFFER_LOCK_SHARE
+#define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE
+#define HASH_NOLOCK		(-1)
+
+#define HASH_SHARE		ShareLock
+#define HASH_EXCLUSIVE	ExclusiveLock

 /*
 *	Strategy number. There's only one valid strategy for hashing: equality.
@@ -199,8 +225,6 @@ typedef HashItemData *HashItem;
 #define HASHPROC		1


-extern bool BuildingHash;
-
 /* public routines */

 extern Datum hashbuild(PG_FUNCTION_ARGS);
@@ -250,36 +274,37 @@ extern void _hash_squeezebucket(Relation rel,
 								Bucket bucket, BlockNumber bucket_blkno);

 /* hashpage.c */
-extern void _hash_metapinit(Relation rel);
+extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
+extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
+extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
 extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access);
-extern void _hash_relbuf(Relation rel, Buffer buf, int access);
+extern void _hash_relbuf(Relation rel, Buffer buf);
+extern void _hash_dropbuf(Relation rel, Buffer buf);
 extern void _hash_wrtbuf(Relation rel, Buffer buf);
-extern void _hash_wrtnorelbuf(Buffer buf);
+extern void _hash_wrtnorelbuf(Relation rel, Buffer buf);
 extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
 				   int to_access);
+extern void _hash_metapinit(Relation rel);
 extern void _hash_pageinit(Page page, Size size);
 extern void _hash_expandtable(Relation rel, Buffer metabuf);

 /* hashscan.c */
 extern void _hash_regscan(IndexScanDesc scan);
 extern void _hash_dropscan(IndexScanDesc scan);
-extern void _hash_adjscans(Relation rel, ItemPointer tid);
+extern bool _hash_has_active_scan(Relation rel, Bucket bucket);
 extern void AtEOXact_hash(void);

 /* hashsearch.c */
-extern void _hash_search(Relation rel, int keysz, ScanKey scankey,
-			 Buffer *bufP, HashMetaPage metap);
 extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
 extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
-extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir,
-		   Buffer metabuf);
+extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);

 /* hashutil.c */
-extern ScanKey _hash_mkscankey(Relation rel, IndexTuple itup);
-extern void _hash_freeskey(ScanKey skey);
 extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
 extern HashItem _hash_formitem(IndexTuple itup);
-extern Bucket _hash_call(Relation rel, HashMetaPage metap, Datum key);
+extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
+extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
+								   uint32 highmask, uint32 lowmask);
 extern uint32 _hash_log2(uint32 num);
 extern void _hash_checkpage(Relation rel, Page page, int flags);


--- a/src/include/storage/lmgr.h
+++ b/src/include/storage/lmgr.h
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: lmgr.h,v 1.39 2003/08/04 02:40:14 momjian Exp $
+ * $Id: lmgr.h,v 1.40 2003/09/04 22:06:27 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -54,8 +54,9 @@ extern void UnlockRelation(Relation relation, LOCKMODE lockmode);
 extern void LockRelationForSession(LockRelId *relid, LOCKMODE lockmode);
 extern void UnlockRelationForSession(LockRelId *relid, LOCKMODE lockmode);

-/* Lock a page (mainly used for indices) */
+/* Lock a page (mainly used for indexes) */
 extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
+extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
 extern void UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);

 /* Lock an XID (used to wait for a transaction to finish) */