Commit 51ee9fa1 authored by Tom Lane

Add support to dynahash.c for partitioning shared hashtables according
to the low-order bits of the entry hash value.  Also make some incidental
cleanups in the dynahash API, such as not exporting the hash header
structs to the world.
parent c0e9b313
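
To make the new interface concrete, here is a rough caller-side sketch of a partitioned shared hash table built on what this commit adds (the HASH_PARTITION flag, HASHCTL.num_partitions, get_hash_value, and hash_search_with_hash_value). It is illustrative only: the table name, the MyKey/MyEntry types, the NUM_PARTS constant, and the FirstMyHashLock LWLock range are assumptions, not code from this commit.

/*
 * Hypothetical usage sketch of a partitioned shared hash table.
 * MyKey, MyEntry, NUM_PARTS and FirstMyHashLock are made-up names.
 */
#include "postgres.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/hsearch.h"

#define NUM_PARTS	16				/* must be a power of 2 */

extern LWLockId FirstMyHashLock;	/* assumed range of NUM_PARTS per-partition locks */

typedef struct MyKey
{
	uint32		id;
} MyKey;

typedef struct MyEntry
{
	MyKey		key;				/* dynahash expects the key at the start of the entry */
	int			payload;
} MyEntry;

static HTAB *MyHash;

static void
my_hash_init(void)
{
	HASHCTL		info;

	MemSet(&info, 0, sizeof(info));
	info.keysize = sizeof(MyKey);
	info.entrysize = sizeof(MyEntry);
	info.hash = tag_hash;			/* generic hash function from hashfn.c */
	info.num_partitions = NUM_PARTS;

	/* ShmemInitHash supplies HASH_SHARED_MEM, HASH_ALLOC and HASH_DIRSIZE itself */
	MyHash = ShmemInitHash("My Partitioned Hash",
						   1024, 1024,		/* init and max entries */
						   &info,
						   HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
}

static MyEntry *
my_hash_lookup(const MyKey *key)
{
	uint32		hashcode = get_hash_value(MyHash, key);
	int			partition = hashcode % NUM_PARTS;	/* low-order bits */
	MyEntry    *entry;

	/* one LWLock per partition; shared lock suffices for HASH_FIND */
	LWLockAcquire(FirstMyHashLock + partition, LW_SHARED);
	entry = (MyEntry *) hash_search_with_hash_value(MyHash, key, hashcode,
													HASH_FIND, NULL);
	LWLockRelease(FirstMyHashLock + partition);
	return entry;
}
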
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.93 2006/07/14 14:52:22 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.94 2006/07/22 23:04:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -211,9 +211,6 @@ InitShmemIndex(void)
 {
 	HASHCTL		info;
 	int			hash_flags;
-	ShmemIndexEnt *result,
-				item;
-	bool		found;

 	/*
 	 * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
@@ -227,32 +224,11 @@ InitShmemIndex(void)
 	info.entrysize = sizeof(ShmemIndexEnt);
 	hash_flags = HASH_ELEM;

-	/* This will acquire the shmem index lock, but not release it. */
 	ShmemIndex = ShmemInitHash("ShmemIndex",
							   SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
							   &info, hash_flags);
 	if (!ShmemIndex)
 		elog(FATAL, "could not initialize Shmem Index");
-
-	/*
-	 * Now, create an entry in the hashtable for the index itself.
-	 */
-	if (!IsUnderPostmaster)
-	{
-		MemSet(item.key, 0, SHMEM_INDEX_KEYSIZE);
-		strncpy(item.key, "ShmemIndex", SHMEM_INDEX_KEYSIZE);
-
-		result = (ShmemIndexEnt *)
-			hash_search(ShmemIndex, (void *) &item, HASH_ENTER, &found);
-
-		Assert(!found);
-
-		result->location = MAKE_OFFSET(ShmemIndex->hctl);
-		result->size = SHMEM_INDEX_SIZE;
-	}
-
-	/* now release the lock acquired in ShmemInitStruct */
-	LWLockRelease(ShmemIndexLock);
 }
/* /*
@@ -295,7 +271,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
 	/* look it up in the shmem index */
 	location = ShmemInitStruct(name,
-							   sizeof(HASHHDR) + infoP->dsize * sizeof(HASHSEGMENT),
+							   hash_get_shared_size(infoP, hash_flags),
 							   &found);

 	/*
@@ -312,9 +288,8 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
 	if (found)
 		hash_flags |= HASH_ATTACH;

-	/* Now provide the header and directory pointers */
+	/* Pass location of hashtable header to hash_create */
 	infoP->hctl = (HASHHDR *) location;
-	infoP->dir = (HASHSEGMENT *) (((char *) location) + sizeof(HASHHDR));

 	return hash_create(name, init_size, infoP, hash_flags);
 }
@@ -363,14 +338,16 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
 		 * If the shmem index doesn't exist, we are bootstrapping: we must
 		 * be trying to init the shmem index itself.
 		 *
-		 * Notice that the ShmemIndexLock is held until the shmem index
-		 * has been completely initialized.
+		 * Notice that the ShmemIndexLock is released before the shmem
+		 * index has been initialized.  This should be OK because no
+		 * other process can be accessing shared memory yet.
 		 */
 		Assert(shmemseghdr->indexoffset == 0);
 		structPtr = ShmemAlloc(size);
 		shmemseghdr->indexoffset = MAKE_OFFSET(structPtr);
 		*foundPtr = FALSE;
 	}
+
+	LWLockRelease(ShmemIndexLock);

 	return structPtr;
 }
......
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.166 2006/07/14 14:52:23 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.167 2006/07/22 23:04:39 tgl Exp $
 *
 * NOTES
 *	  A lock table is a shared memory hash table.  When
@@ -1958,7 +1958,7 @@ GetLockStatusData(void)
 	{
 		LWLockAcquire(FirstLockMgrLock + i, LW_SHARED);
 		proclockTable = LockMethodProcLockHash[i];
-		els += proclockTable->hctl->nentries;
+		els += hash_get_num_entries(proclockTable);
 	}
 	data->nelements = els;
......
...@@ -3,17 +3,36 @@ ...@@ -3,17 +3,36 @@
* dynahash.c * dynahash.c
* dynamic hash tables * dynamic hash tables
* *
* dynahash.c supports both local-to-a-backend hash tables and hash tables in
* shared memory. For shared hash tables, it is the caller's responsibility
* to provide appropriate access interlocking. The simplest convention is
* that a single LWLock protects the whole hash table. Searches (HASH_FIND or
* hash_seq_search) need only shared lock, but any update requires exclusive
* lock. For heavily-used shared tables, the single-lock approach creates a
* concurrency bottleneck, so we also support "partitioned" locking wherein
* there are multiple LWLocks guarding distinct subsets of the table. To use
* a hash table in partitioned mode, the HASH_PARTITION flag must be given
* to hash_create. This prevents any attempt to split buckets on-the-fly.
* Therefore, each hash bucket chain operates independently, and no fields
* of the hash header change after init except nentries and freeList.
* A partitioned table uses a spinlock to guard changes of those two fields.
* This lets any subset of the hash buckets be treated as a separately
* lockable partition. We expect callers to use the low-order bits of a
* lookup key's hash value as a partition number --- this will work because
* of the way calc_bucket() maps hash values to bucket numbers.
* *
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.69 2006/07/14 14:52:25 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.70 2006/07/22 23:04:39 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
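
A concrete reading of the convention described in the header comment above: init_htab() forces the initial bucket count up to at least num_partitions, and calc_bucket() only masks the hash value, so the low-order bits of the hash value carry through to the bucket number and can be used directly as a partition id. A minimal sketch, with a made-up macro name:

/* Illustration only: num_partitions must be a power of 2 (hash_create
 * Asserts this), so masking the hash value selects the partition. */
#define MY_HASH_PARTITION(hashvalue, num_partitions) \
	((hashvalue) & ((uint32) ((num_partitions) - 1)))
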
/* /*
* Original comments:
* *
* Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson. * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
* Coded into C, with minor code improvements, and with hsearch(3) interface, * Coded into C, with minor code improvements, and with hsearch(3) interface,
...@@ -45,9 +64,107 @@ ...@@ -45,9 +64,107 @@
#include "postgres.h" #include "postgres.h"
#include "storage/shmem.h" #include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h" #include "utils/dynahash.h"
#include "utils/memutils.h" #include "utils/memutils.h"
/*
* Constants
*
* A hash table has a top-level "directory", each of whose entries points
* to a "segment" of ssize bucket headers. The maximum number of hash
* buckets is thus dsize * ssize (but dsize may be expansible). Of course,
* the number of records in the table can be larger, but we don't want a
* whole lot of records per bucket or performance goes down.
*
* In a hash table allocated in shared memory, the directory cannot be
* expanded because it must stay at a fixed address. The directory size
* should be selected using hash_select_dirsize (and you'd better have
* a good idea of the maximum number of entries!). For non-shared hash
* tables, the initial directory size can be left at the default.
*/
#define DEF_SEGSIZE 256
#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE 256
#define DEF_FFACTOR 1 /* default fill factor */
/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;
/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;
/*
* Header structure for a hash table --- contains all changeable info
*
* In a shared-memory hash table, the HASHHDR is in shared memory, while
* each backend has a local HTAB struct. For a non-shared table, there isn't
* any functional difference between HASHHDR and HTAB, but we separate them
* anyway to share code between shared and non-shared tables.
*/
struct HASHHDR
{
/* In a partitioned table, take this lock to touch nentries or freeList */
slock_t mutex; /* unused if not partitioned table */
/* These fields change during entry addition/deletion */
long nentries; /* number of entries in hash table */
HASHELEMENT *freeList; /* linked list of free elements */
/* These fields can change, but not in a partitioned table */
/* Also, dsize can't change in a shared table, even if unpartitioned */
long dsize; /* directory size */
long nsegs; /* number of allocated segments (<= dsize) */
uint32 max_bucket; /* ID of maximum bucket in use */
uint32 high_mask; /* mask to modulo into entire table */
uint32 low_mask; /* mask to modulo into lower half of table */
/* These fields are fixed at hashtable creation */
Size keysize; /* hash key length in bytes */
Size entrysize; /* total user element size in bytes */
long num_partitions; /* # partitions (must be power of 2), or 0 */
long ffactor; /* target fill factor */
long max_dsize; /* 'dsize' limit if directory is fixed size */
long ssize; /* segment size --- must be power of 2 */
int sshift; /* segment shift = log2(ssize) */
int nelem_alloc; /* number of entries to allocate at once */
#ifdef HASH_STATISTICS
/*
* Count statistics here. NB: stats code doesn't bother with mutex,
* so counts could be corrupted a bit in a partitioned table.
*/
long accesses;
long collisions;
#endif
};
#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
/*
* Top control structure for a hashtable --- in a shared table, each backend
* has its own copy (OK since no fields change at runtime)
*/
struct HTAB
{
HASHHDR *hctl; /* => shared control information */
HASHSEGMENT *dir; /* directory of segment starts */
HashValueFunc hash; /* hash function */
HashCompareFunc match; /* key comparison function */
HashCopyFunc keycopy; /* key copying function */
HashAllocFunc alloc; /* memory allocator */
MemoryContext hcxt; /* memory context if default allocator used */
char *tabname; /* table name (for error messages) */
bool isshared; /* true if table is in shared memory */
/* We keep local copies of these fixed values to reduce contention */
Size keysize; /* hash key length in bytes */
long ssize; /* segment size --- must be power of 2 */
int sshift; /* segment shift = log2(ssize) */
};
/* /*
* Key (also entry) part of a HASHELEMENT * Key (also entry) part of a HASHELEMENT
*/ */
...@@ -58,6 +175,12 @@ ...@@ -58,6 +175,12 @@
*/ */
#define MOD(x,y) ((x) & ((y)-1)) #define MOD(x,y) ((x) & ((y)-1))
#if HASH_STATISTICS
static long hash_accesses,
hash_collisions,
hash_expansions;
#endif
/* /*
* Private function prototypes * Private function prototypes
*/ */
...@@ -66,6 +189,7 @@ static HASHSEGMENT seg_alloc(HTAB *hashp); ...@@ -66,6 +189,7 @@ static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem); static bool element_alloc(HTAB *hashp, int nelem);
static bool dir_realloc(HTAB *hashp); static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp); static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp);
static void hdefault(HTAB *hashp); static void hdefault(HTAB *hashp);
static int choose_nelem_alloc(Size entrysize); static int choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem); static bool init_htab(HTAB *hashp, long nelem);
...@@ -85,13 +209,6 @@ DynaHashAlloc(Size size) ...@@ -85,13 +209,6 @@ DynaHashAlloc(Size size)
} }
#if HASH_STATISTICS
static long hash_accesses,
hash_collisions,
hash_expansions;
#endif
/************************** CREATE ROUTINES **********************/ /************************** CREATE ROUTINES **********************/
/* /*
...@@ -185,17 +302,26 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) ...@@ -185,17 +302,26 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
if (flags & HASH_SHARED_MEM) if (flags & HASH_SHARED_MEM)
{ {
 	/*
-	 * ctl structure is preallocated for shared memory tables. Note that
-	 * HASH_DIRSIZE and HASH_ALLOC had better be set as well.
+	 * ctl structure and directory are preallocated for shared memory
+	 * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set
+	 * as well.
 	 */
 	hashp->hctl = info->hctl;
-	hashp->dir = info->dir;
+	hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
hashp->hcxt = NULL; hashp->hcxt = NULL;
hashp->isshared = true; hashp->isshared = true;
/* hash table already exists, we're just attaching to it */ /* hash table already exists, we're just attaching to it */
if (flags & HASH_ATTACH) if (flags & HASH_ATTACH)
{
/* make local copies of some heavily-used values */
hctl = hashp->hctl;
hashp->keysize = hctl->keysize;
hashp->ssize = hctl->ssize;
hashp->sshift = hctl->sshift;
return hashp; return hashp;
}
} }
else else
{ {
...@@ -218,9 +344,16 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) ...@@ -218,9 +344,16 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
hdefault(hashp); hdefault(hashp);
hctl = hashp->hctl; hctl = hashp->hctl;
-#ifdef HASH_STATISTICS
-	hctl->accesses = hctl->collisions = 0;
-#endif
+	if (flags & HASH_PARTITION)
+	{
+		/* Doesn't make sense to partition a local hash table */
+		Assert(flags & HASH_SHARED_MEM);
+		/* # of partitions had better be a power of 2 */
+		Assert(info->num_partitions == (1L << my_log2(info->num_partitions)));
+
+		hctl->num_partitions = info->num_partitions;
+	}
if (flags & HASH_SEGMENT) if (flags & HASH_SEGMENT)
{ {
...@@ -252,6 +385,11 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) ...@@ -252,6 +385,11 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
hctl->entrysize = info->entrysize; hctl->entrysize = info->entrysize;
} }
/* make local copies of heavily-used constant fields */
hashp->keysize = hctl->keysize;
hashp->ssize = hctl->ssize;
hashp->sshift = hctl->sshift;
/* Build the hash directory structure */ /* Build the hash directory structure */
if (!init_htab(hashp, nelem)) if (!init_htab(hashp, nelem))
{ {
...@@ -292,22 +430,29 @@ hdefault(HTAB *hashp) ...@@ -292,22 +430,29 @@ hdefault(HTAB *hashp)
MemSet(hctl, 0, sizeof(HASHHDR)); MemSet(hctl, 0, sizeof(HASHHDR));
hctl->ssize = DEF_SEGSIZE;
hctl->sshift = DEF_SEGSIZE_SHIFT;
hctl->dsize = DEF_DIRSIZE;
hctl->ffactor = DEF_FFACTOR;
hctl->nentries = 0; hctl->nentries = 0;
hctl->freeList = NULL;
hctl->dsize = DEF_DIRSIZE;
hctl->nsegs = 0; hctl->nsegs = 0;
/* rather pointless defaults for key & entry size */ /* rather pointless defaults for key & entry size */
hctl->keysize = sizeof(char *); hctl->keysize = sizeof(char *);
hctl->entrysize = 2 * sizeof(char *); hctl->entrysize = 2 * sizeof(char *);
hctl->num_partitions = 0; /* not partitioned */
hctl->ffactor = DEF_FFACTOR;
/* table has no fixed maximum size */ /* table has no fixed maximum size */
hctl->max_dsize = NO_MAX_DSIZE; hctl->max_dsize = NO_MAX_DSIZE;
-	/* garbage collection for HASH_REMOVE */
-	hctl->freeList = NULL;
+	hctl->ssize = DEF_SEGSIZE;
+	hctl->sshift = DEF_SEGSIZE_SHIFT;
#ifdef HASH_STATISTICS
hctl->accesses = hctl->collisions = 0;
#endif
} }
/* /*
...@@ -342,6 +487,10 @@ choose_nelem_alloc(Size entrysize) ...@@ -342,6 +487,10 @@ choose_nelem_alloc(Size entrysize)
return nelem_alloc; return nelem_alloc;
} }
/*
* Compute derived fields of hctl and build the initial directory/segment
* arrays
*/
static bool static bool
init_htab(HTAB *hashp, long nelem) init_htab(HTAB *hashp, long nelem)
{ {
...@@ -351,6 +500,12 @@ init_htab(HTAB *hashp, long nelem) ...@@ -351,6 +500,12 @@ init_htab(HTAB *hashp, long nelem)
int nbuckets; int nbuckets;
int nsegs; int nsegs;
/*
* initialize mutex if it's a partitioned table
*/
if (IS_PARTITIONED(hctl))
SpinLockInit(&hctl->mutex);
/* /*
* Divide number of elements by the fill factor to determine a desired * Divide number of elements by the fill factor to determine a desired
* number of buckets. Allocate space for the next greater power of two * number of buckets. Allocate space for the next greater power of two
...@@ -360,6 +515,15 @@ init_htab(HTAB *hashp, long nelem) ...@@ -360,6 +515,15 @@ init_htab(HTAB *hashp, long nelem)
nbuckets = 1 << my_log2(lnbuckets); nbuckets = 1 << my_log2(lnbuckets);
/*
* In a partitioned table, nbuckets must be at least equal to
* num_partitions; were it less, keys with apparently different partition
* numbers would map to the same bucket, breaking partition independence.
* (Normally nbuckets will be much bigger; this is just a safety check.)
*/
while (nbuckets < hctl->num_partitions)
nbuckets <<= 1;
hctl->max_bucket = hctl->low_mask = nbuckets - 1; hctl->max_bucket = hctl->low_mask = nbuckets - 1;
hctl->high_mask = (nbuckets << 1) - 1; hctl->high_mask = (nbuckets << 1) - 1;
...@@ -491,6 +655,19 @@ hash_select_dirsize(long num_entries) ...@@ -491,6 +655,19 @@ hash_select_dirsize(long num_entries)
return nDirEntries; return nDirEntries;
} }
/*
* Compute the required initial memory allocation for a shared-memory
* hashtable with the given parameters. We need space for the HASHHDR
* and for the (non expansible) directory.
*/
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
Assert(flags & HASH_DIRSIZE);
Assert(info->dsize == info->max_dsize);
return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}
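
ShmemInitHash() in the shmem.c hunk above is the intended caller of this function. The fragment below is a simplified paraphrase of that sizing path, showing why the Asserts hold: the directory size is pinned to its maximum before the shared allocation is requested. max_table_size is a stand-in name.

	/* simplified paraphrase of the ShmemInitHash() sizing path */
	infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_table_size);
	hash_flags |= HASH_DIRSIZE;

	location = ShmemInitStruct(name,
							   hash_get_shared_size(infoP, hash_flags),
							   &found);
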
/********************** DESTROY ROUTINES ************************/ /********************** DESTROY ROUTINES ************************/
...@@ -521,7 +698,7 @@ hash_stats(const char *where, HTAB *hashp) ...@@ -521,7 +698,7 @@ hash_stats(const char *where, HTAB *hashp)
where, hashp->hctl->accesses, hashp->hctl->collisions); where, hashp->hctl->accesses, hashp->hctl->collisions);
fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n", fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-			hashp->hctl->nentries, hashp->hctl->keysize,
+			hashp->hctl->nentries, (long) hashp->hctl->keysize,
hashp->hctl->max_bucket, hashp->hctl->nsegs); hashp->hctl->max_bucket, hashp->hctl->nsegs);
fprintf(stderr, "%s: total accesses %ld total collisions %ld\n", fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
where, hash_accesses, hash_collisions); where, hash_accesses, hash_collisions);
...@@ -533,6 +710,19 @@ hash_stats(const char *where, HTAB *hashp) ...@@ -533,6 +710,19 @@ hash_stats(const char *where, HTAB *hashp)
/*******************************SEARCH ROUTINES *****************************/ /*******************************SEARCH ROUTINES *****************************/
/*
* get_hash_value -- exported routine to calculate a key's hash value
*
* We export this because for partitioned tables, callers need to compute
* the partition number (from the low-order bits of the hash value) before
* searching.
*/
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
return hashp->hash(keyPtr, hashp->keysize);
}
/* Convert a hash value to a bucket number */ /* Convert a hash value to a bucket number */
static inline uint32 static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val) calc_bucket(HASHHDR *hctl, uint32 hash_val)
...@@ -546,8 +736,9 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val) ...@@ -546,8 +736,9 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val)
return bucket; return bucket;
} }
-/*----------
+/*
* hash_search -- look up key in table and perform action * hash_search -- look up key in table and perform action
* hash_search_with_hash_value -- same, with key's hash value already computed
* *
* action is one of: * action is one of:
* HASH_FIND: look up key in table * HASH_FIND: look up key in table
...@@ -568,17 +759,32 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val) ...@@ -568,17 +759,32 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val)
* If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
* existing entry in the table, FALSE otherwise. This is needed in the * existing entry in the table, FALSE otherwise. This is needed in the
* HASH_ENTER case, but is redundant with the return value otherwise. * HASH_ENTER case, but is redundant with the return value otherwise.
- *----------
+ *
* For hash_search_with_hash_value, the hashvalue parameter must have been
* calculated with get_hash_value().
*/ */
void * void *
hash_search(HTAB *hashp, hash_search(HTAB *hashp,
const void *keyPtr, const void *keyPtr,
HASHACTION action, HASHACTION action,
bool *foundPtr) bool *foundPtr)
{
return hash_search_with_hash_value(hashp,
keyPtr,
hashp->hash(keyPtr, hashp->keysize),
action,
foundPtr);
}
void *
hash_search_with_hash_value(HTAB *hashp,
const void *keyPtr,
uint32 hashvalue,
HASHACTION action,
bool *foundPtr)
{ {
HASHHDR *hctl = hashp->hctl; HASHHDR *hctl = hashp->hctl;
-	Size		keysize = hctl->keysize;
-	uint32		hashvalue;
+	Size		keysize;
uint32 bucket; uint32 bucket;
long segment_num; long segment_num;
long segment_ndx; long segment_ndx;
...@@ -595,11 +801,10 @@ hash_search(HTAB *hashp, ...@@ -595,11 +801,10 @@ hash_search(HTAB *hashp,
/* /*
* Do the initial lookup * Do the initial lookup
*/ */
-	hashvalue = hashp->hash(keyPtr, keysize);
 	bucket = calc_bucket(hctl, hashvalue);
-	segment_num = bucket >> hctl->sshift;
-	segment_ndx = MOD(bucket, hctl->ssize);
+	segment_num = bucket >> hashp->sshift;
+	segment_ndx = MOD(bucket, hashp->ssize);
segp = hashp->dir[segment_num]; segp = hashp->dir[segment_num];
...@@ -613,6 +818,7 @@ hash_search(HTAB *hashp, ...@@ -613,6 +818,7 @@ hash_search(HTAB *hashp,
* Follow collision chain looking for matching key * Follow collision chain looking for matching key
*/ */
match = hashp->match; /* save one fetch in inner loop */ match = hashp->match; /* save one fetch in inner loop */
keysize = hashp->keysize; /* ditto */
while (currBucket != NULL) while (currBucket != NULL)
{ {
...@@ -643,15 +849,25 @@ hash_search(HTAB *hashp, ...@@ -643,15 +849,25 @@ hash_search(HTAB *hashp,
case HASH_REMOVE: case HASH_REMOVE:
if (currBucket != NULL) if (currBucket != NULL)
 			{
-				Assert(hctl->nentries > 0);
-				hctl->nentries--;
+				/* use volatile pointer to prevent code rearrangement */
+				volatile HASHHDR *hctlv = hctl;
+
+				/* if partitioned, must lock to touch nentries and freeList */
+				if (IS_PARTITIONED(hctlv))
+					SpinLockAcquire(&hctlv->mutex);
+
+				Assert(hctlv->nentries > 0);
+				hctlv->nentries--;

 				/* remove record from hash bucket's chain. */
 				*prevBucketPtr = currBucket->link;

 				/* add the record to the freelist for this table. */
-				currBucket->link = hctl->freeList;
-				hctl->freeList = currBucket;
+				currBucket->link = hctlv->freeList;
+				hctlv->freeList = currBucket;
+
+				if (IS_PARTITIONED(hctlv))
+					SpinLockRelease(&hctlv->mutex);
/* /*
* better hope the caller is synchronizing access to this * better hope the caller is synchronizing access to this
...@@ -672,32 +888,23 @@ hash_search(HTAB *hashp, ...@@ -672,32 +888,23 @@ hash_search(HTAB *hashp,
 			if (currBucket != NULL)
 				return (void *) ELEMENTKEY(currBucket);

-			/* get the next free element */
-			currBucket = hctl->freeList;
+			currBucket = get_hash_entry(hashp);
 			if (currBucket == NULL)
 			{
-				/* no free elements. allocate another chunk of buckets */
-				if (!element_alloc(hashp, hctl->nelem_alloc))
-				{
-					/* out of memory */
-					if (action == HASH_ENTER_NULL)
-						return NULL;
-					/* report a generic message */
-					if (hashp->isshared)
-						ereport(ERROR,
-								(errcode(ERRCODE_OUT_OF_MEMORY),
-								 errmsg("out of shared memory")));
-					else
-						ereport(ERROR,
-								(errcode(ERRCODE_OUT_OF_MEMORY),
-								 errmsg("out of memory")));
-				}
-				currBucket = hctl->freeList;
-				Assert(currBucket != NULL);
+				/* out of memory */
+				if (action == HASH_ENTER_NULL)
+					return NULL;
+				/* report a generic message */
+				if (hashp->isshared)
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of shared memory")));
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
 			}
-
-			hctl->freeList = currBucket->link;
/* link into hashbucket chain */ /* link into hashbucket chain */
*prevBucketPtr = currBucket; *prevBucketPtr = currBucket;
currBucket->link = NULL; currBucket->link = NULL;
...@@ -708,8 +915,10 @@ hash_search(HTAB *hashp, ...@@ -708,8 +915,10 @@ hash_search(HTAB *hashp,
/* caller is expected to fill the data field on return */ /* caller is expected to fill the data field on return */
-			/* Check if it is time to split the segment */
-			if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
+			/* Check if it is time to split a bucket */
+			/* Can't split if running in partitioned mode */
+			if (!IS_PARTITIONED(hctl) &&
+				hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
{ {
/* /*
* NOTE: failure to expand table is not a fatal error, it just * NOTE: failure to expand table is not a fatal error, it just
...@@ -726,6 +935,61 @@ hash_search(HTAB *hashp, ...@@ -726,6 +935,61 @@ hash_search(HTAB *hashp,
return NULL; /* keep compiler quiet */ return NULL; /* keep compiler quiet */
} }
/*
* create a new entry if possible
*/
static HASHBUCKET
get_hash_entry(HTAB *hashp)
{
/* use volatile pointer to prevent code rearrangement */
volatile HASHHDR *hctlv = hashp->hctl;
HASHBUCKET newElement;
for (;;)
{
/* if partitioned, must lock to touch nentries and freeList */
if (IS_PARTITIONED(hctlv))
SpinLockAcquire(&hctlv->mutex);
/* try to get an entry from the freelist */
newElement = hctlv->freeList;
if (newElement != NULL)
break;
/* no free elements. allocate another chunk of buckets */
if (IS_PARTITIONED(hctlv))
SpinLockRelease(&hctlv->mutex);
if (!element_alloc(hashp, hctlv->nelem_alloc))
{
/* out of memory */
return NULL;
}
}
/* remove entry from freelist, bump nentries */
hctlv->freeList = newElement->link;
hctlv->nentries++;
if (IS_PARTITIONED(hctlv))
SpinLockRelease(&hctlv->mutex);
return newElement;
}
/*
* hash_get_num_entries -- get the number of entries in a hashtable
*/
long
hash_get_num_entries(HTAB *hashp)
{
/*
* We currently don't bother with the mutex; it's only sensible to call
* this function if you've got lock on all partitions of the table.
*/
return hashp->hctl->nentries;
}
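
The GetLockStatusData() hunk in lock.c earlier in this commit is the model for the comment above: take at least shared lock on every partition, and only then trust the entry counts. A condensed sketch of that pattern follows; LockMethodProcLockHash and FirstLockMgrLock come from the lock.c hunk, while the NUM_LOCK_PARTITIONS loop bound is assumed here.

	/* hold shared lock on every partition before counting */
	for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
		LWLockAcquire(FirstLockMgrLock + i, LW_SHARED);

	for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
		total += hash_get_num_entries(LockMethodProcLockHash[i]);

	for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
		LWLockRelease(FirstLockMgrLock + i);
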
/* /*
* hash_seq_init/_search * hash_seq_init/_search
* Sequentially search through hash table and return * Sequentially search through hash table and return
...@@ -736,6 +1000,9 @@ hash_search(HTAB *hashp, ...@@ -736,6 +1000,9 @@ hash_search(HTAB *hashp,
* UNDEFINED (it might be the one that curIndex is pointing at!). Also, * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
* if elements are added to the table while the scan is in progress, it is * if elements are added to the table while the scan is in progress, it is
* unspecified whether they will be visited by the scan or not. * unspecified whether they will be visited by the scan or not.
*
* NOTE: to use this with a partitioned hashtable, caller had better hold
* at least shared lock on all partitions of the table throughout the scan!
*/ */
void void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
...@@ -773,7 +1040,7 @@ hash_seq_search(HASH_SEQ_STATUS *status) ...@@ -773,7 +1040,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
curBucket = status->curBucket; curBucket = status->curBucket;
hashp = status->hashp; hashp = status->hashp;
hctl = hashp->hctl; hctl = hashp->hctl;
-	ssize = hctl->ssize;
+	ssize = hashp->ssize;
max_bucket = hctl->max_bucket; max_bucket = hctl->max_bucket;
if (curBucket > max_bucket) if (curBucket > max_bucket)
...@@ -782,7 +1049,7 @@ hash_seq_search(HASH_SEQ_STATUS *status) ...@@ -782,7 +1049,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
/* /*
* first find the right segment in the table directory. * first find the right segment in the table directory.
*/ */
-	segment_num = curBucket >> hctl->sshift;
+	segment_num = curBucket >> hashp->sshift;
segment_ndx = MOD(curBucket, ssize); segment_ndx = MOD(curBucket, ssize);
segp = hashp->dir[segment_num]; segp = hashp->dir[segment_num];
...@@ -840,13 +1107,15 @@ expand_table(HTAB *hashp) ...@@ -840,13 +1107,15 @@ expand_table(HTAB *hashp)
HASHBUCKET currElement, HASHBUCKET currElement,
nextElement; nextElement;
Assert(!IS_PARTITIONED(hctl));
#ifdef HASH_STATISTICS #ifdef HASH_STATISTICS
hash_expansions++; hash_expansions++;
#endif #endif
new_bucket = hctl->max_bucket + 1; new_bucket = hctl->max_bucket + 1;
-	new_segnum = new_bucket >> hctl->sshift;
-	new_segndx = MOD(new_bucket, hctl->ssize);
+	new_segnum = new_bucket >> hashp->sshift;
+	new_segndx = MOD(new_bucket, hashp->ssize);
if (new_segnum >= hctl->nsegs) if (new_segnum >= hctl->nsegs)
{ {
...@@ -885,8 +1154,8 @@ expand_table(HTAB *hashp) ...@@ -885,8 +1154,8 @@ expand_table(HTAB *hashp)
* split at this point. With a different way of reducing the hash value, * split at this point. With a different way of reducing the hash value,
* that might not be true! * that might not be true!
*/ */
-	old_segnum = old_bucket >> hctl->sshift;
-	old_segndx = MOD(old_bucket, hctl->ssize);
+	old_segnum = old_bucket >> hashp->sshift;
+	old_segndx = MOD(old_bucket, hashp->ssize);
old_seg = hashp->dir[old_segnum]; old_seg = hashp->dir[old_segnum];
new_seg = hashp->dir[new_segnum]; new_seg = hashp->dir[new_segnum];
...@@ -963,12 +1232,12 @@ seg_alloc(HTAB *hashp) ...@@ -963,12 +1232,12 @@ seg_alloc(HTAB *hashp)
HASHSEGMENT segp; HASHSEGMENT segp;
CurrentDynaHashCxt = hashp->hcxt; CurrentDynaHashCxt = hashp->hcxt;
-	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->hctl->ssize);
+	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

 	if (!segp)
 		return NULL;

-	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->hctl->ssize);
+	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
return segp; return segp;
} }
...@@ -979,29 +1248,44 @@ seg_alloc(HTAB *hashp) ...@@ -979,29 +1248,44 @@ seg_alloc(HTAB *hashp)
static bool static bool
element_alloc(HTAB *hashp, int nelem) element_alloc(HTAB *hashp, int nelem)
 {
-	HASHHDR    *hctl = hashp->hctl;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile HASHHDR *hctlv = hashp->hctl;
 	Size		elementSize;
+	HASHELEMENT *firstElement;
 	HASHELEMENT *tmpElement;
+	HASHELEMENT *prevElement;
 	int			i;

 	/* Each element has a HASHELEMENT header plus user data. */
-	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
+	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize);

 	CurrentDynaHashCxt = hashp->hcxt;
-	tmpElement = (HASHELEMENT *)
-		hashp->alloc(nelem * elementSize);
+	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

-	if (!tmpElement)
+	if (!firstElement)
 		return false;

-	/* link all the new entries into the freelist */
+	/* prepare to link all the new entries into the freelist */
+	prevElement = NULL;
+	tmpElement = firstElement;
 	for (i = 0; i < nelem; i++)
 	{
-		tmpElement->link = hctl->freeList;
-		hctl->freeList = tmpElement;
+		tmpElement->link = prevElement;
+		prevElement = tmpElement;
 		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
 	}

+	/* if partitioned, must lock to touch freeList */
+	if (IS_PARTITIONED(hctlv))
+		SpinLockAcquire(&hctlv->mutex);
+
+	/* freelist could be nonempty if two backends did this concurrently */
+	firstElement->link = hctlv->freeList;
+	hctlv->freeList = prevElement;
+
+	if (IS_PARTITIONED(hctlv))
+		SpinLockRelease(&hctlv->mutex);
+
 	return true;
 }
......
 /*-------------------------------------------------------------------------
 *
 * hsearch.h
- *	  for hash tables, particularly hash tables in shared memory
+ *	  exported definitions for utils/hash/dynahash.c; see notes therein
 *
 *
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.43 2006/06/25 18:29:49 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.44 2006/07/22 23:04:39 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
...@@ -43,27 +43,6 @@ typedef void *(*HashCopyFunc) (void *dest, const void *src, Size keysize); ...@@ -43,27 +43,6 @@ typedef void *(*HashCopyFunc) (void *dest, const void *src, Size keysize);
*/ */
typedef void *(*HashAllocFunc) (Size request); typedef void *(*HashAllocFunc) (Size request);
/*
* Constants
*
* A hash table has a top-level "directory", each of whose entries points
* to a "segment" of ssize bucket headers. The maximum number of hash
* buckets is thus dsize * ssize (but dsize may be expansible). Of course,
* the number of records in the table can be larger, but we don't want a
* whole lot of records per bucket or performance goes down.
*
* In a hash table allocated in shared memory, the directory cannot be
* expanded because it must stay at a fixed address. The directory size
* should be selected using hash_select_dirsize (and you'd better have
* a good idea of the maximum number of entries!). For non-shared hash
* tables, the initial directory size can be left at the default.
*/
#define DEF_SEGSIZE 256
#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE 256
#define DEF_FFACTOR 1 /* default fill factor */
/* /*
* HASHELEMENT is the private part of a hashtable entry. The caller's data * HASHELEMENT is the private part of a hashtable entry. The caller's data
* follows the HASHELEMENT structure (on a MAXALIGN'd boundary). The hash key * follows the HASHELEMENT structure (on a MAXALIGN'd boundary). The hash key
...@@ -75,81 +54,42 @@ typedef struct HASHELEMENT ...@@ -75,81 +54,42 @@ typedef struct HASHELEMENT
uint32 hashvalue; /* hash function result for this entry */ uint32 hashvalue; /* hash function result for this entry */
} HASHELEMENT; } HASHELEMENT;
-/* A hash bucket is a linked list of HASHELEMENTs */
-typedef HASHELEMENT *HASHBUCKET;
+/* Hash table header struct is an opaque type known only within dynahash.c */
+typedef struct HASHHDR HASHHDR;

-/* A hash segment is an array of bucket headers */
-typedef HASHBUCKET *HASHSEGMENT;
+/* Hash table control struct is an opaque type known only within dynahash.c */
+typedef struct HTAB HTAB;
/* Header structure for a hash table --- contains all changeable info */
typedef struct HASHHDR
{
long dsize; /* Directory Size */
long ssize; /* Segment Size --- must be power of 2 */
int sshift; /* Segment shift = log2(ssize) */
uint32 max_bucket; /* ID of Maximum bucket in use */
uint32 high_mask; /* Mask to modulo into entire table */
uint32 low_mask; /* Mask to modulo into lower half of table */
long ffactor; /* Fill factor */
long nentries; /* Number of entries in hash table */
long nsegs; /* Number of allocated segments */
Size keysize; /* hash key length in bytes */
Size entrysize; /* total user element size in bytes */
long max_dsize; /* 'dsize' limit if directory is fixed size */
int nelem_alloc; /* number of entries to allocate at once */
HASHELEMENT *freeList; /* linked list of free elements */
#ifdef HASH_STATISTICS
long accesses;
long collisions;
#endif
} HASHHDR;
/*
* Top control structure for a hashtable --- need not be shared, since
* no fields change at runtime
*/
typedef struct HTAB
{
HASHHDR *hctl; /* shared control information */
HASHSEGMENT *dir; /* directory of segment starts */
HashValueFunc hash; /* hash function */
HashCompareFunc match; /* key comparison function */
HashCopyFunc keycopy; /* key copying function */
HashAllocFunc alloc; /* memory allocator */
MemoryContext hcxt; /* memory context if default allocator used */
char *tabname; /* table name (for error messages) */
bool isshared; /* true if table is in shared memory */
} HTAB;
 /* Parameter data structure for hash_create */
 /* Only those fields indicated by hash_flags need be set */
 typedef struct HASHCTL
 {
-	long		ssize;			/* Segment Size */
-	long		dsize;			/* (initial) Directory Size */
-	long		max_dsize;		/* limit to dsize if directory size is limited */
-	long		ffactor;		/* Fill factor */
+	long		num_partitions; /* # partitions (must be power of 2) */
+	long		ssize;			/* segment size */
+	long		dsize;			/* (initial) directory size */
+	long		max_dsize;		/* limit to dsize if dir size is limited */
+	long		ffactor;		/* fill factor */
 	Size		keysize;		/* hash key length in bytes */
 	Size		entrysize;		/* total user element size in bytes */
 	HashValueFunc hash;			/* hash function */
 	HashCompareFunc match;		/* key comparison function */
 	HashCopyFunc keycopy;		/* key copying function */
 	HashAllocFunc alloc;		/* memory allocator */
-	HASHSEGMENT *dir;			/* directory of segment starts */
-	HASHHDR    *hctl;			/* location of header in shared mem */
 	MemoryContext hcxt;			/* memory context to use for allocations */
+	HASHHDR    *hctl;			/* location of header in shared mem */
 } HASHCTL;

 /* Flags to indicate which parameters are supplied */
+#define HASH_PARTITION	0x001	/* Hashtable is used w/partitioned locking */
 #define HASH_SEGMENT	0x002	/* Set segment size */
-#define HASH_DIRSIZE	0x004	/* Set directory size */
+#define HASH_DIRSIZE	0x004	/* Set directory size (initial and max) */
 #define HASH_FFACTOR	0x008	/* Set fill factor */
 #define HASH_FUNCTION	0x010	/* Set user defined hash function */
-#define HASH_ELEM		0x020	/* Set key/entry size */
+#define HASH_ELEM		0x020	/* Set keysize and entrysize */
 #define HASH_SHARED_MEM 0x040	/* Hashtable is in shared memory */
 #define HASH_ATTACH		0x080	/* Do not initialize hctl */
 #define HASH_ALLOC		0x100	/* Set memory allocator */
-#define HASH_CONTEXT	0x200	/* Set explicit memory context */
+#define HASH_CONTEXT	0x200	/* Set memory allocation context */
 #define HASH_COMPARE	0x400	/* Set user defined comparison function */
 #define HASH_KEYCOPY	0x800	/* Set user defined key-copying function */
...@@ -183,10 +123,16 @@ extern void hash_destroy(HTAB *hashp); ...@@ -183,10 +123,16 @@ extern void hash_destroy(HTAB *hashp);
extern void hash_stats(const char *where, HTAB *hashp); extern void hash_stats(const char *where, HTAB *hashp);
extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action,
bool *foundPtr); bool *foundPtr);
extern uint32 get_hash_value(HTAB *hashp, const void *keyPtr);
extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr,
uint32 hashvalue, HASHACTION action,
bool *foundPtr);
extern long hash_get_num_entries(HTAB *hashp);
extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp); extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp);
extern void *hash_seq_search(HASH_SEQ_STATUS *status); extern void *hash_seq_search(HASH_SEQ_STATUS *status);
extern Size hash_estimate_size(long num_entries, Size entrysize); extern Size hash_estimate_size(long num_entries, Size entrysize);
extern long hash_select_dirsize(long num_entries); extern long hash_select_dirsize(long num_entries);
extern Size hash_get_shared_size(HASHCTL *info, int flags);
/* /*
* prototypes for functions in hashfn.c * prototypes for functions in hashfn.c
......