Commit 51ee9fa1 authored by Tom Lane

Add support to dynahash.c for partitioning shared hashtables according
to the low-order bits of the entry hash value.  Also make some incidental
cleanups in the dynahash API, such as not exporting the hash header
structs to the world.
parent c0e9b313
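
To make the new interface concrete, here is a rough caller-side sketch of a partitioned shared hash table built on what this commit adds (the HASH_PARTITION flag, HASHCTL.num_partitions, get_hash_value, and hash_search_with_hash_value). It is illustrative only: the table name, the MyKey/MyEntry types, the NUM_PARTS constant, and the FirstMyHashLock LWLock range are assumptions, not code from this commit.

/*
 * Hypothetical usage sketch of a partitioned shared hash table.
 * MyKey, MyEntry, NUM_PARTS and FirstMyHashLock are made-up names.
 */
#include "postgres.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/hsearch.h"

#define NUM_PARTS	16				/* must be a power of 2 */

extern LWLockId FirstMyHashLock;	/* assumed range of NUM_PARTS per-partition locks */

typedef struct MyKey
{
	uint32		id;
} MyKey;

typedef struct MyEntry
{
	MyKey		key;				/* dynahash expects the key at the start of the entry */
	int			payload;
} MyEntry;

static HTAB *MyHash;

static void
my_hash_init(void)
{
	HASHCTL		info;

	MemSet(&info, 0, sizeof(info));
	info.keysize = sizeof(MyKey);
	info.entrysize = sizeof(MyEntry);
	info.hash = tag_hash;			/* generic hash function from hashfn.c */
	info.num_partitions = NUM_PARTS;

	/* ShmemInitHash supplies HASH_SHARED_MEM, HASH_ALLOC and HASH_DIRSIZE itself */
	MyHash = ShmemInitHash("My Partitioned Hash",
						   1024, 1024,		/* init and max entries */
						   &info,
						   HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
}

static MyEntry *
my_hash_lookup(const MyKey *key)
{
	uint32		hashcode = get_hash_value(MyHash, key);
	int			partition = hashcode % NUM_PARTS;	/* low-order bits */
	MyEntry    *entry;

	/* one LWLock per partition; shared lock suffices for HASH_FIND */
	LWLockAcquire(FirstMyHashLock + partition, LW_SHARED);
	entry = (MyEntry *) hash_search_with_hash_value(MyHash, key, hashcode,
													HASH_FIND, NULL);
	LWLockRelease(FirstMyHashLock + partition);
	return entry;
}
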
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.93 2006/07/14 14:52:22 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.94 2006/07/22 23:04:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -211,9 +211,6 @@ InitShmemIndex(void)
 {
 	HASHCTL		info;
 	int			hash_flags;
-	ShmemIndexEnt *result,
-				item;
-	bool		found;

 	/*
 	 * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
@@ -227,32 +224,11 @@ InitShmemIndex(void)
 	info.entrysize = sizeof(ShmemIndexEnt);
 	hash_flags = HASH_ELEM;

-	/* This will acquire the shmem index lock, but not release it. */
 	ShmemIndex = ShmemInitHash("ShmemIndex",
							   SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
							   &info, hash_flags);
 	if (!ShmemIndex)
 		elog(FATAL, "could not initialize Shmem Index");
-
-	/*
-	 * Now, create an entry in the hashtable for the index itself.
-	 */
-	if (!IsUnderPostmaster)
-	{
-		MemSet(item.key, 0, SHMEM_INDEX_KEYSIZE);
-		strncpy(item.key, "ShmemIndex", SHMEM_INDEX_KEYSIZE);
-
-		result = (ShmemIndexEnt *)
-			hash_search(ShmemIndex, (void *) &item, HASH_ENTER, &found);
-
-		Assert(!found);
-
-		result->location = MAKE_OFFSET(ShmemIndex->hctl);
-		result->size = SHMEM_INDEX_SIZE;
-	}
-
-	/* now release the lock acquired in ShmemInitStruct */
-	LWLockRelease(ShmemIndexLock);
 }
/* /*
@@ -295,7 +271,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
 	/* look it up in the shmem index */
 	location = ShmemInitStruct(name,
-							   sizeof(HASHHDR) + infoP->dsize * sizeof(HASHSEGMENT),
+							   hash_get_shared_size(infoP, hash_flags),
 							   &found);

 	/*
@@ -312,9 +288,8 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
 	if (found)
 		hash_flags |= HASH_ATTACH;

-	/* Now provide the header and directory pointers */
+	/* Pass location of hashtable header to hash_create */
 	infoP->hctl = (HASHHDR *) location;
-	infoP->dir = (HASHSEGMENT *) (((char *) location) + sizeof(HASHHDR));

 	return hash_create(name, init_size, infoP, hash_flags);
 }
@@ -363,14 +338,16 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
 		 * If the shmem index doesn't exist, we are bootstrapping: we must
 		 * be trying to init the shmem index itself.
 		 *
-		 * Notice that the ShmemIndexLock is held until the shmem index
-		 * has been completely initialized.
+		 * Notice that the ShmemIndexLock is released before the shmem
+		 * index has been initialized.  This should be OK because no
+		 * other process can be accessing shared memory yet.
 		 */
 		Assert(shmemseghdr->indexoffset == 0);
 		structPtr = ShmemAlloc(size);
 		shmemseghdr->indexoffset = MAKE_OFFSET(structPtr);
 		*foundPtr = FALSE;
 	}
+
+	LWLockRelease(ShmemIndexLock);

 	return structPtr;
 }
......
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.166 2006/07/14 14:52:23 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.167 2006/07/22 23:04:39 tgl Exp $
 *
 * NOTES
 *	  A lock table is a shared memory hash table.  When
@@ -1958,7 +1958,7 @@ GetLockStatusData(void)
 	{
 		LWLockAcquire(FirstLockMgrLock + i, LW_SHARED);
 		proclockTable = LockMethodProcLockHash[i];
-		els += proclockTable->hctl->nentries;
+		els += hash_get_num_entries(proclockTable);
 	}
 	data->nelements = els;
......
...@@ -3,17 +3,36 @@ ...@@ -3,17 +3,36 @@
* dynahash.c * dynahash.c
* dynamic hash tables * dynamic hash tables
* *
* dynahash.c supports both local-to-a-backend hash tables and hash tables in
* shared memory. For shared hash tables, it is the caller's responsibility
* to provide appropriate access interlocking. The simplest convention is
* that a single LWLock protects the whole hash table. Searches (HASH_FIND or
* hash_seq_search) need only shared lock, but any update requires exclusive
* lock. For heavily-used shared tables, the single-lock approach creates a
* concurrency bottleneck, so we also support "partitioned" locking wherein
* there are multiple LWLocks guarding distinct subsets of the table. To use
* a hash table in partitioned mode, the HASH_PARTITION flag must be given
* to hash_create. This prevents any attempt to split buckets on-the-fly.
* Therefore, each hash bucket chain operates independently, and no fields
* of the hash header change after init except nentries and freeList.
* A partitioned table uses a spinlock to guard changes of those two fields.
* This lets any subset of the hash buckets be treated as a separately
* lockable partition. We expect callers to use the low-order bits of a
* lookup key's hash value as a partition number --- this will work because
* of the way calc_bucket() maps hash values to bucket numbers.
* *
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.69 2006/07/14 14:52:25 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.70 2006/07/22 23:04:39 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
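
A concrete reading of the convention described in the header comment above: init_htab() forces the initial bucket count up to at least num_partitions, and calc_bucket() only masks the hash value, so the low-order bits of the hash value carry through to the bucket number and can be used directly as a partition id. A minimal sketch, with a made-up macro name:

/* Illustration only: num_partitions must be a power of 2 (hash_create
 * Asserts this), so masking the hash value selects the partition. */
#define MY_HASH_PARTITION(hashvalue, num_partitions) \
	((hashvalue) & ((uint32) ((num_partitions) - 1)))
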
/* /*
* Original comments:
* *
* Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson. * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
* Coded into C, with minor code improvements, and with hsearch(3) interface, * Coded into C, with minor code improvements, and with hsearch(3) interface,
...@@ -45,9 +64,107 @@ ...@@ -45,9 +64,107 @@
#include "postgres.h" #include "postgres.h"
#include "storage/shmem.h" #include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h" #include "utils/dynahash.h"
#include "utils/memutils.h" #include "utils/memutils.h"
/*
* Constants
*
* A hash table has a top-level "directory", each of whose entries points
* to a "segment" of ssize bucket headers. The maximum number of hash
* buckets is thus dsize * ssize (but dsize may be expansible). Of course,
* the number of records in the table can be larger, but we don't want a
* whole lot of records per bucket or performance goes down.
*
* In a hash table allocated in shared memory, the directory cannot be
* expanded because it must stay at a fixed address. The directory size
* should be selected using hash_select_dirsize (and you'd better have
* a good idea of the maximum number of entries!). For non-shared hash
* tables, the initial directory size can be left at the default.
*/
#define DEF_SEGSIZE 256
#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE 256
#define DEF_FFACTOR 1 /* default fill factor */
/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;
/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;
/*
* Header structure for a hash table --- contains all changeable info
*
* In a shared-memory hash table, the HASHHDR is in shared memory, while
* each backend has a local HTAB struct. For a non-shared table, there isn't
* any functional difference between HASHHDR and HTAB, but we separate them
* anyway to share code between shared and non-shared tables.
*/
struct HASHHDR
{
/* In a partitioned table, take this lock to touch nentries or freeList */
slock_t mutex; /* unused if not partitioned table */
/* These fields change during entry addition/deletion */
long nentries; /* number of entries in hash table */
HASHELEMENT *freeList; /* linked list of free elements */
/* These fields can change, but not in a partitioned table */
/* Also, dsize can't change in a shared table, even if unpartitioned */
long dsize; /* directory size */
long nsegs; /* number of allocated segments (<= dsize) */
uint32 max_bucket; /* ID of maximum bucket in use */
uint32 high_mask; /* mask to modulo into entire table */
uint32 low_mask; /* mask to modulo into lower half of table */
/* These fields are fixed at hashtable creation */
Size keysize; /* hash key length in bytes */
Size entrysize; /* total user element size in bytes */
long num_partitions; /* # partitions (must be power of 2), or 0 */
long ffactor; /* target fill factor */
long max_dsize; /* 'dsize' limit if directory is fixed size */
long ssize; /* segment size --- must be power of 2 */
int sshift; /* segment shift = log2(ssize) */
int nelem_alloc; /* number of entries to allocate at once */
#ifdef HASH_STATISTICS
/*
* Count statistics here. NB: stats code doesn't bother with mutex,
* so counts could be corrupted a bit in a partitioned table.
*/
long accesses;
long collisions;
#endif
};
#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
/*
* Top control structure for a hashtable --- in a shared table, each backend
* has its own copy (OK since no fields change at runtime)
*/
struct HTAB
{
HASHHDR *hctl; /* => shared control information */
HASHSEGMENT *dir; /* directory of segment starts */
HashValueFunc hash; /* hash function */
HashCompareFunc match; /* key comparison function */
HashCopyFunc keycopy; /* key copying function */
HashAllocFunc alloc; /* memory allocator */
MemoryContext hcxt; /* memory context if default allocator used */
char *tabname; /* table name (for error messages) */
bool isshared; /* true if table is in shared memory */
/* We keep local copies of these fixed values to reduce contention */
Size keysize; /* hash key length in bytes */
long ssize; /* segment size --- must be power of 2 */
int sshift; /* segment shift = log2(ssize) */
};
/* /*
* Key (also entry) part of a HASHELEMENT * Key (also entry) part of a HASHELEMENT
*/ */
...@@ -58,6 +175,12 @@ ...@@ -58,6 +175,12 @@
*/ */
#define MOD(x,y) ((x) & ((y)-1)) #define MOD(x,y) ((x) & ((y)-1))
#if HASH_STATISTICS
static long hash_accesses,
hash_collisions,
hash_expansions;
#endif
/* /*
* Private function prototypes * Private function prototypes
*/ */
...@@ -66,6 +189,7 @@ static HASHSEGMENT seg_alloc(HTAB *hashp); ...@@ -66,6 +189,7 @@ static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem); static bool element_alloc(HTAB *hashp, int nelem);
static bool dir_realloc(HTAB *hashp); static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp); static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp);
static void hdefault(HTAB *hashp); static void hdefault(HTAB *hashp);
static int choose_nelem_alloc(Size entrysize); static int choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem); static bool init_htab(HTAB *hashp, long nelem);
...@@ -85,13 +209,6 @@ DynaHashAlloc(Size size) ...@@ -85,13 +209,6 @@ DynaHashAlloc(Size size)
} }
#if HASH_STATISTICS
static long hash_accesses,
hash_collisions,
hash_expansions;
#endif
/************************** CREATE ROUTINES **********************/ /************************** CREATE ROUTINES **********************/
/* /*
...@@ -185,17 +302,26 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) ...@@ -185,17 +302,26 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
if (flags & HASH_SHARED_MEM) if (flags & HASH_SHARED_MEM)
{ {
 	/*
-	 * ctl structure is preallocated for shared memory tables. Note that
-	 * HASH_DIRSIZE and HASH_ALLOC had better be set as well.
+	 * ctl structure and directory are preallocated for shared memory
+	 * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set
+	 * as well.
 	 */
 	hashp->hctl = info->hctl;
-	hashp->dir = info->dir;
+	hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
hashp->hcxt = NULL; hashp->hcxt = NULL;
hashp->isshared = true; hashp->isshared = true;
/* hash table already exists, we're just attaching to it */ /* hash table already exists, we're just attaching to it */
if (flags & HASH_ATTACH) if (flags & HASH_ATTACH)
{
/* make local copies of some heavily-used values */
hctl = hashp->hctl;
hashp->keysize = hctl->keysize;
hashp->ssize = hctl->ssize;
hashp->sshift = hctl->sshift;
return hashp; return hashp;
}
} }
else else
{ {
...@@ -218,9 +344,16 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) ...@@ -218,9 +344,16 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
hdefault(hashp); hdefault(hashp);
hctl = hashp->hctl; hctl = hashp->hctl;
-#ifdef HASH_STATISTICS
-	hctl->accesses = hctl->collisions = 0;
-#endif
+	if (flags & HASH_PARTITION)
+	{
+		/* Doesn't make sense to partition a local hash table */
+		Assert(flags & HASH_SHARED_MEM);
+		/* # of partitions had better be a power of 2 */
+		Assert(info->num_partitions == (1L << my_log2(info->num_partitions)));
+
+		hctl->num_partitions = info->num_partitions;
+	}
if (flags & HASH_SEGMENT) if (flags & HASH_SEGMENT)
{ {
...@@ -252,6 +385,11 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) ...@@ -252,6 +385,11 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
hctl->entrysize = info->entrysize; hctl->entrysize = info->entrysize;
} }
/* make local copies of heavily-used constant fields */
hashp->keysize = hctl->keysize;
hashp->ssize = hctl->ssize;
hashp->sshift = hctl->sshift;
/* Build the hash directory structure */ /* Build the hash directory structure */
if (!init_htab(hashp, nelem)) if (!init_htab(hashp, nelem))
{ {
...@@ -292,22 +430,29 @@ hdefault(HTAB *hashp) ...@@ -292,22 +430,29 @@ hdefault(HTAB *hashp)
MemSet(hctl, 0, sizeof(HASHHDR)); MemSet(hctl, 0, sizeof(HASHHDR));
hctl->ssize = DEF_SEGSIZE;
hctl->sshift = DEF_SEGSIZE_SHIFT;
hctl->dsize = DEF_DIRSIZE;
hctl->ffactor = DEF_FFACTOR;
hctl->nentries = 0; hctl->nentries = 0;
hctl->freeList = NULL;
hctl->dsize = DEF_DIRSIZE;
hctl->nsegs = 0; hctl->nsegs = 0;
/* rather pointless defaults for key & entry size */ /* rather pointless defaults for key & entry size */
hctl->keysize = sizeof(char *); hctl->keysize = sizeof(char *);
hctl->entrysize = 2 * sizeof(char *); hctl->entrysize = 2 * sizeof(char *);
hctl->num_partitions = 0; /* not partitioned */
hctl->ffactor = DEF_FFACTOR;
/* table has no fixed maximum size */ /* table has no fixed maximum size */
hctl->max_dsize = NO_MAX_DSIZE; hctl->max_dsize = NO_MAX_DSIZE;
-	/* garbage collection for HASH_REMOVE */
-	hctl->freeList = NULL;
+	hctl->ssize = DEF_SEGSIZE;
+	hctl->sshift = DEF_SEGSIZE_SHIFT;
#ifdef HASH_STATISTICS
hctl->accesses = hctl->collisions = 0;
#endif
} }
/* /*
...@@ -342,6 +487,10 @@ choose_nelem_alloc(Size entrysize) ...@@ -342,6 +487,10 @@ choose_nelem_alloc(Size entrysize)
return nelem_alloc; return nelem_alloc;
} }
/*
* Compute derived fields of hctl and build the initial directory/segment
* arrays
*/
static bool static bool
init_htab(HTAB *hashp, long nelem) init_htab(HTAB *hashp, long nelem)
{ {
...@@ -351,6 +500,12 @@ init_htab(HTAB *hashp, long nelem) ...@@ -351,6 +500,12 @@ init_htab(HTAB *hashp, long nelem)
int nbuckets; int nbuckets;
int nsegs; int nsegs;
/*
* initialize mutex if it's a partitioned table
*/
if (IS_PARTITIONED(hctl))
SpinLockInit(&hctl->mutex);
/* /*
* Divide number of elements by the fill factor to determine a desired * Divide number of elements by the fill factor to determine a desired
* number of buckets. Allocate space for the next greater power of two * number of buckets. Allocate space for the next greater power of two
...@@ -360,6 +515,15 @@ init_htab(HTAB *hashp, long nelem) ...@@ -360,6 +515,15 @@ init_htab(HTAB *hashp, long nelem)
nbuckets = 1 << my_log2(lnbuckets); nbuckets = 1 << my_log2(lnbuckets);
/*
* In a partitioned table, nbuckets must be at least equal to
* num_partitions; were it less, keys with apparently different partition
* numbers would map to the same bucket, breaking partition independence.
* (Normally nbuckets will be much bigger; this is just a safety check.)
*/
while (nbuckets < hctl->num_partitions)
nbuckets <<= 1;
hctl->max_bucket = hctl->low_mask = nbuckets - 1; hctl->max_bucket = hctl->low_mask = nbuckets - 1;
hctl->high_mask = (nbuckets << 1) - 1; hctl->high_mask = (nbuckets << 1) - 1;
...@@ -491,6 +655,19 @@ hash_select_dirsize(long num_entries) ...@@ -491,6 +655,19 @@ hash_select_dirsize(long num_entries)
return nDirEntries; return nDirEntries;
} }
/*
* Compute the required initial memory allocation for a shared-memory
* hashtable with the given parameters. We need space for the HASHHDR
* and for the (non expansible) directory.
*/
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
Assert(flags & HASH_DIRSIZE);
Assert(info->dsize == info->max_dsize);
return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}
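
ShmemInitHash() in the shmem.c hunk above is the intended caller of this function. The fragment below is a simplified paraphrase of that sizing path, showing why the Asserts hold: the directory size is pinned to its maximum before the shared allocation is requested. max_table_size is a stand-in name.

	/* simplified paraphrase of the ShmemInitHash() sizing path */
	infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_table_size);
	hash_flags |= HASH_DIRSIZE;

	location = ShmemInitStruct(name,
							   hash_get_shared_size(infoP, hash_flags),
							   &found);
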
/********************** DESTROY ROUTINES ************************/ /********************** DESTROY ROUTINES ************************/
...@@ -521,7 +698,7 @@ hash_stats(const char *where, HTAB *hashp) ...@@ -521,7 +698,7 @@ hash_stats(const char *where, HTAB *hashp)
where, hashp->hctl->accesses, hashp->hctl->collisions); where, hashp->hctl->accesses, hashp->hctl->collisions);
fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n", fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-			hashp->hctl->nentries, hashp->hctl->keysize,
+			hashp->hctl->nentries, (long) hashp->hctl->keysize,
hashp->hctl->max_bucket, hashp->hctl->nsegs); hashp->hctl->max_bucket, hashp->hctl->nsegs);
fprintf(stderr, "%s: total accesses %ld total collisions %ld\n", fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
where, hash_accesses, hash_collisions); where, hash_accesses, hash_collisions);
...@@ -533,6 +710,19 @@ hash_stats(const char *where, HTAB *hashp) ...@@ -533,6 +710,19 @@ hash_stats(const char *where, HTAB *hashp)
/*******************************SEARCH ROUTINES *****************************/ /*******************************SEARCH ROUTINES *****************************/
/*
* get_hash_value -- exported routine to calculate a key's hash value
*
* We export this because for partitioned tables, callers need to compute
* the partition number (from the low-order bits of the hash value) before
* searching.
*/
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
return hashp->hash(keyPtr, hashp->keysize);
}
/* Convert a hash value to a bucket number */ /* Convert a hash value to a bucket number */
static inline uint32 static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val) calc_bucket(HASHHDR *hctl, uint32 hash_val)
...@@ -546,8 +736,9 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val) ...@@ -546,8 +736,9 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val)
return bucket; return bucket;
} }
-/*----------
+/*
* hash_search -- look up key in table and perform action * hash_search -- look up key in table and perform action
* hash_search_with_hash_value -- same, with key's hash value already computed
* *
* action is one of: * action is one of:
* HASH_FIND: look up key in table * HASH_FIND: look up key in table
...@@ -568,17 +759,32 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val) ...@@ -568,17 +759,32 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val)
* If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
* existing entry in the table, FALSE otherwise. This is needed in the * existing entry in the table, FALSE otherwise. This is needed in the
* HASH_ENTER case, but is redundant with the return value otherwise. * HASH_ENTER case, but is redundant with the return value otherwise.
- *----------
+ *
* For hash_search_with_hash_value, the hashvalue parameter must have been
* calculated with get_hash_value().
*/ */
void * void *
hash_search(HTAB *hashp, hash_search(HTAB *hashp,
const void *keyPtr, const void *keyPtr,
HASHACTION action, HASHACTION action,
bool *foundPtr) bool *foundPtr)
{
return hash_search_with_hash_value(hashp,
keyPtr,
hashp->hash(keyPtr, hashp->keysize),
action,
foundPtr);
}
void *
hash_search_with_hash_value(HTAB *hashp,
const void *keyPtr,
uint32 hashvalue,
HASHACTION action,
bool *foundPtr)
{ {
HASHHDR *hctl = hashp->hctl; HASHHDR *hctl = hashp->hctl;
-	Size		keysize = hctl->keysize;
-	uint32		hashvalue;
+	Size		keysize;
uint32 bucket; uint32 bucket;
long segment_num; long segment_num;
long segment_ndx; long segment_ndx;
...@@ -595,11 +801,10 @@ hash_search(HTAB *hashp, ...@@ -595,11 +801,10 @@ hash_search(HTAB *hashp,
/* /*
* Do the initial lookup * Do the initial lookup
*/ */
-	hashvalue = hashp->hash(keyPtr, keysize);
 	bucket = calc_bucket(hctl, hashvalue);
-	segment_num = bucket >> hctl->sshift;
-	segment_ndx = MOD(bucket, hctl->ssize);
+	segment_num = bucket >> hashp->sshift;
+	segment_ndx = MOD(bucket, hashp->ssize);
segp = hashp->dir[segment_num]; segp = hashp->dir[segment_num];
...@@ -613,6 +818,7 @@ hash_search(HTAB *hashp, ...@@ -613,6 +818,7 @@ hash_search(HTAB *hashp,
* Follow collision chain looking for matching key * Follow collision chain looking for matching key
*/ */
match = hashp->match; /* save one fetch in inner loop */ match = hashp->match; /* save one fetch in inner loop */
keysize = hashp->keysize; /* ditto */
while (currBucket != NULL) while (currBucket != NULL)
{ {
...@@ -643,15 +849,25 @@ hash_search(HTAB *hashp, ...@@ -643,15 +849,25 @@ hash_search(HTAB *hashp,
case HASH_REMOVE: case HASH_REMOVE:
if (currBucket != NULL) if (currBucket != NULL)
 			{
-				Assert(hctl->nentries > 0);
-				hctl->nentries--;
+				/* use volatile pointer to prevent code rearrangement */
+				volatile HASHHDR *hctlv = hctl;
+
+				/* if partitioned, must lock to touch nentries and freeList */
+				if (IS_PARTITIONED(hctlv))
+					SpinLockAcquire(&hctlv->mutex);
+
+				Assert(hctlv->nentries > 0);
+				hctlv->nentries--;

 				/* remove record from hash bucket's chain. */
 				*prevBucketPtr = currBucket->link;

 				/* add the record to the freelist for this table. */
-				currBucket->link = hctl->freeList;
-				hctl->freeList = currBucket;
+				currBucket->link = hctlv->freeList;
+				hctlv->freeList = currBucket;
+
+				if (IS_PARTITIONED(hctlv))
+					SpinLockRelease(&hctlv->mutex);
/* /*
* better hope the caller is synchronizing access to this * better hope the caller is synchronizing access to this
...@@ -672,32 +888,23 @@ hash_search(HTAB *hashp, ...@@ -672,32 +888,23 @@ hash_search(HTAB *hashp,
 			if (currBucket != NULL)
 				return (void *) ELEMENTKEY(currBucket);

-			/* get the next free element */
-			currBucket = hctl->freeList;
+			currBucket = get_hash_entry(hashp);
 			if (currBucket == NULL)
 			{
-				/* no free elements. allocate another chunk of buckets */
-				if (!element_alloc(hashp, hctl->nelem_alloc))
-				{
-					/* out of memory */
-					if (action == HASH_ENTER_NULL)
-						return NULL;
-					/* report a generic message */
-					if (hashp->isshared)
-						ereport(ERROR,
-								(errcode(ERRCODE_OUT_OF_MEMORY),
-								 errmsg("out of shared memory")));
-					else
-						ereport(ERROR,
-								(errcode(ERRCODE_OUT_OF_MEMORY),
-								 errmsg("out of memory")));
-				}
-				currBucket = hctl->freeList;
-				Assert(currBucket != NULL);
+				/* out of memory */
+				if (action == HASH_ENTER_NULL)
+					return NULL;
+				/* report a generic message */
+				if (hashp->isshared)
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of shared memory")));
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
 			}
-
-			hctl->freeList = currBucket->link;
/* link into hashbucket chain */ /* link into hashbucket chain */
*prevBucketPtr = currBucket; *prevBucketPtr = currBucket;
currBucket->link = NULL; currBucket->link = NULL;
...@@ -708,8 +915,10 @@ hash_search(HTAB *hashp, ...@@ -708,8 +915,10 @@ hash_search(HTAB *hashp,
/* caller is expected to fill the data field on return */ /* caller is expected to fill the data field on return */
-			/* Check if it is time to split the segment */
-			if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
+			/* Check if it is time to split a bucket */
+			/* Can't split if running in partitioned mode */
+			if (!IS_PARTITIONED(hctl) &&
+				hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
{ {
/* /*
* NOTE: failure to expand table is not a fatal error, it just * NOTE: failure to expand table is not a fatal error, it just
...@@ -726,6 +935,61 @@ hash_search(HTAB *hashp, ...@@ -726,6 +935,61 @@ hash_search(HTAB *hashp,
return NULL; /* keep compiler quiet */ return NULL; /* keep compiler quiet */
} }
/*
* create a new entry if possible
*/
static HASHBUCKET
get_hash_entry(HTAB *hashp)
{
/* use volatile pointer to prevent code rearrangement */
volatile HASHHDR *hctlv = hashp->hctl;
HASHBUCKET newElement;
for (;;)
{
/* if partitioned, must lock to touch nentries and freeList */
if (IS_PARTITIONED(hctlv))
SpinLockAcquire(&hctlv->mutex);
/* try to get an entry from the freelist */
newElement = hctlv->freeList;
if (newElement != NULL)
break;
/* no free elements. allocate another chunk of buckets */
if (IS_PARTITIONED(hctlv))
SpinLockRelease(&hctlv->mutex);
if (!element_alloc(hashp, hctlv->nelem_alloc))
{
/* out of memory */
return NULL;
}
}
/* remove entry from freelist, bump nentries */
hctlv->freeList = newElement->link;
hctlv->nentries++;
if (IS_PARTITIONED(hctlv))
SpinLockRelease(&hctlv->mutex);
return newElement;
}
/*
* hash_get_num_entries -- get the number of entries in a hashtable
*/
long
hash_get_num_entries(HTAB *hashp)
{
/*
* We currently don't bother with the mutex; it's only sensible to call
* this function if you've got lock on all partitions of the table.
*/
return hashp->hctl->nentries;
}
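
The GetLockStatusData() hunk in lock.c earlier in this commit is the model for the comment above: take at least shared lock on every partition, and only then trust the entry counts. A condensed sketch of that pattern follows; LockMethodProcLockHash and FirstLockMgrLock come from the lock.c hunk, while the NUM_LOCK_PARTITIONS loop bound is assumed here.

	/* hold shared lock on every partition before counting */
	for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
		LWLockAcquire(FirstLockMgrLock + i, LW_SHARED);

	for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
		total += hash_get_num_entries(LockMethodProcLockHash[i]);

	for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
		LWLockRelease(FirstLockMgrLock + i);
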
/* /*
* hash_seq_init/_search * hash_seq_init/_search
* Sequentially search through hash table and return * Sequentially search through hash table and return
...@@ -736,6 +1000,9 @@ hash_search(HTAB *hashp, ...@@ -736,6 +1000,9 @@ hash_search(HTAB *hashp,
* UNDEFINED (it might be the one that curIndex is pointing at!). Also, * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
* if elements are added to the table while the scan is in progress, it is * if elements are added to the table while the scan is in progress, it is
* unspecified whether they will be visited by the scan or not. * unspecified whether they will be visited by the scan or not.
*
* NOTE: to use this with a partitioned hashtable, caller had better hold
* at least shared lock on all partitions of the table throughout the scan!
*/ */
void void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
...@@ -773,7 +1040,7 @@ hash_seq_search(HASH_SEQ_STATUS *status) ...@@ -773,7 +1040,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
curBucket = status->curBucket; curBucket = status->curBucket;
hashp = status->hashp; hashp = status->hashp;
hctl = hashp->hctl; hctl = hashp->hctl;
-	ssize = hctl->ssize;
+	ssize = hashp->ssize;
max_bucket = hctl->max_bucket; max_bucket = hctl->max_bucket;
if (curBucket > max_bucket) if (curBucket > max_bucket)
...@@ -782,7 +1049,7 @@ hash_seq_search(HASH_SEQ_STATUS *status) ...@@ -782,7 +1049,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
/* /*
* first find the right segment in the table directory. * first find the right segment in the table directory.
*/ */
-	segment_num = curBucket >> hctl->sshift;
+	segment_num = curBucket >> hashp->sshift;
segment_ndx = MOD(curBucket, ssize); segment_ndx = MOD(curBucket, ssize);
segp = hashp->dir[segment_num]; segp = hashp->dir[segment_num];
...@@ -840,13 +1107,15 @@ expand_table(HTAB *hashp) ...@@ -840,13 +1107,15 @@ expand_table(HTAB *hashp)
HASHBUCKET currElement, HASHBUCKET currElement,
nextElement; nextElement;
Assert(!IS_PARTITIONED(hctl));
#ifdef HASH_STATISTICS #ifdef HASH_STATISTICS
hash_expansions++; hash_expansions++;
#endif #endif
new_bucket = hctl->max_bucket + 1; new_bucket = hctl->max_bucket + 1;
-	new_segnum = new_bucket >> hctl->sshift;
-	new_segndx = MOD(new_bucket, hctl->ssize);
+	new_segnum = new_bucket >> hashp->sshift;
+	new_segndx = MOD(new_bucket, hashp->ssize);
if (new_segnum >= hctl->nsegs) if (new_segnum >= hctl->nsegs)
{ {
...@@ -885,8 +1154,8 @@ expand_table(HTAB *hashp) ...@@ -885,8 +1154,8 @@ expand_table(HTAB *hashp)
* split at this point. With a different way of reducing the hash value, * split at this point. With a different way of reducing the hash value,
* that might not be true! * that might not be true!
*/ */
-	old_segnum = old_bucket >> hctl->sshift;
-	old_segndx = MOD(old_bucket, hctl->ssize);
+	old_segnum = old_bucket >> hashp->sshift;
+	old_segndx = MOD(old_bucket, hashp->ssize);
old_seg = hashp->dir[old_segnum]; old_seg = hashp->dir[old_segnum];
new_seg = hashp->dir[new_segnum]; new_seg = hashp->dir[new_segnum];
...@@ -963,12 +1232,12 @@ seg_alloc(HTAB *hashp) ...@@ -963,12 +1232,12 @@ seg_alloc(HTAB *hashp)
HASHSEGMENT segp; HASHSEGMENT segp;
CurrentDynaHashCxt = hashp->hcxt; CurrentDynaHashCxt = hashp->hcxt;
-	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->hctl->ssize);
+	segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

 	if (!segp)
 		return NULL;

-	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->hctl->ssize);
+	MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
return segp; return segp;
} }
...@@ -979,29 +1248,44 @@ seg_alloc(HTAB *hashp) ...@@ -979,29 +1248,44 @@ seg_alloc(HTAB *hashp)
static bool static bool
element_alloc(HTAB *hashp, int nelem) element_alloc(HTAB *hashp, int nelem)
 {
-	HASHHDR    *hctl = hashp->hctl;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile HASHHDR *hctlv = hashp->hctl;
 	Size		elementSize;
+	HASHELEMENT *firstElement;
 	HASHELEMENT *tmpElement;
+	HASHELEMENT *prevElement;
 	int			i;

 	/* Each element has a HASHELEMENT header plus user data. */
-	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
+	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize);

 	CurrentDynaHashCxt = hashp->hcxt;
-	tmpElement = (HASHELEMENT *)
-		hashp->alloc(nelem * elementSize);
+	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

-	if (!tmpElement)
+	if (!firstElement)
 		return false;

-	/* link all the new entries into the freelist */
+	/* prepare to link all the new entries into the freelist */
+	prevElement = NULL;
+	tmpElement = firstElement;
 	for (i = 0; i < nelem; i++)
 	{
-		tmpElement->link = hctl->freeList;
-		hctl->freeList = tmpElement;
+		tmpElement->link = prevElement;
+		prevElement = tmpElement;
 		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
 	}

+	/* if partitioned, must lock to touch freeList */
+	if (IS_PARTITIONED(hctlv))
+		SpinLockAcquire(&hctlv->mutex);
+
+	/* freelist could be nonempty if two backends did this concurrently */
+	firstElement->link = hctlv->freeList;
+	hctlv->freeList = prevElement;
+
+	if (IS_PARTITIONED(hctlv))
+		SpinLockRelease(&hctlv->mutex);
+
 	return true;
 }
......
 /*-------------------------------------------------------------------------
 *
 * hsearch.h
- *	  for hash tables, particularly hash tables in shared memory
+ *	  exported definitions for utils/hash/dynahash.c; see notes therein
 *
 *
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.43 2006/06/25 18:29:49 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.44 2006/07/22 23:04:39 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
...@@ -43,27 +43,6 @@ typedef void *(*HashCopyFunc) (void *dest, const void *src, Size keysize); ...@@ -43,27 +43,6 @@ typedef void *(*HashCopyFunc) (void *dest, const void *src, Size keysize);
*/ */
typedef void *(*HashAllocFunc) (Size request); typedef void *(*HashAllocFunc) (Size request);
/*
* Constants
*
* A hash table has a top-level "directory", each of whose entries points
* to a "segment" of ssize bucket headers. The maximum number of hash
* buckets is thus dsize * ssize (but dsize may be expansible). Of course,
* the number of records in the table can be larger, but we don't want a
* whole lot of records per bucket or performance goes down.
*
* In a hash table allocated in shared memory, the directory cannot be
* expanded because it must stay at a fixed address. The directory size
* should be selected using hash_select_dirsize (and you'd better have
* a good idea of the maximum number of entries!). For non-shared hash
* tables, the initial directory size can be left at the default.
*/
#define DEF_SEGSIZE 256
#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE 256
#define DEF_FFACTOR 1 /* default fill factor */
/* /*
* HASHELEMENT is the private part of a hashtable entry. The caller's data * HASHELEMENT is the private part of a hashtable entry. The caller's data
* follows the HASHELEMENT structure (on a MAXALIGN'd boundary). The hash key * follows the HASHELEMENT structure (on a MAXALIGN'd boundary). The hash key
...@@ -75,81 +54,42 @@ typedef struct HASHELEMENT ...@@ -75,81 +54,42 @@ typedef struct HASHELEMENT
uint32 hashvalue; /* hash function result for this entry */ uint32 hashvalue; /* hash function result for this entry */
} HASHELEMENT; } HASHELEMENT;
-/* A hash bucket is a linked list of HASHELEMENTs */
-typedef HASHELEMENT *HASHBUCKET;
+/* Hash table header struct is an opaque type known only within dynahash.c */
+typedef struct HASHHDR HASHHDR;

-/* A hash segment is an array of bucket headers */
-typedef HASHBUCKET *HASHSEGMENT;
+/* Hash table control struct is an opaque type known only within dynahash.c */
+typedef struct HTAB HTAB;
/* Header structure for a hash table --- contains all changeable info */
typedef struct HASHHDR
{
long dsize; /* Directory Size */
long ssize; /* Segment Size --- must be power of 2 */
int sshift; /* Segment shift = log2(ssize) */
uint32 max_bucket; /* ID of Maximum bucket in use */
uint32 high_mask; /* Mask to modulo into entire table */
uint32 low_mask; /* Mask to modulo into lower half of table */
long ffactor; /* Fill factor */
long nentries; /* Number of entries in hash table */
long nsegs; /* Number of allocated segments */
Size keysize; /* hash key length in bytes */
Size entrysize; /* total user element size in bytes */
long max_dsize; /* 'dsize' limit if directory is fixed size */
int nelem_alloc; /* number of entries to allocate at once */
HASHELEMENT *freeList; /* linked list of free elements */
#ifdef HASH_STATISTICS
long accesses;
long collisions;
#endif
} HASHHDR;
/*
* Top control structure for a hashtable --- need not be shared, since
* no fields change at runtime
*/
typedef struct HTAB
{
HASHHDR *hctl; /* shared control information */
HASHSEGMENT *dir; /* directory of segment starts */
HashValueFunc hash; /* hash function */
HashCompareFunc match; /* key comparison function */
HashCopyFunc keycopy; /* key copying function */
HashAllocFunc alloc; /* memory allocator */
MemoryContext hcxt; /* memory context if default allocator used */
char *tabname; /* table name (for error messages) */
bool isshared; /* true if table is in shared memory */
} HTAB;
 /* Parameter data structure for hash_create */
 /* Only those fields indicated by hash_flags need be set */
 typedef struct HASHCTL
 {
-	long		ssize;			/* Segment Size */
-	long		dsize;			/* (initial) Directory Size */
-	long		max_dsize;		/* limit to dsize if directory size is limited */
-	long		ffactor;		/* Fill factor */
+	long		num_partitions; /* # partitions (must be power of 2) */
+	long		ssize;			/* segment size */
+	long		dsize;			/* (initial) directory size */
+	long		max_dsize;		/* limit to dsize if dir size is limited */
+	long		ffactor;		/* fill factor */
 	Size		keysize;		/* hash key length in bytes */
 	Size		entrysize;		/* total user element size in bytes */
 	HashValueFunc hash;			/* hash function */
 	HashCompareFunc match;		/* key comparison function */
 	HashCopyFunc keycopy;		/* key copying function */
 	HashAllocFunc alloc;		/* memory allocator */
-	HASHSEGMENT *dir;			/* directory of segment starts */
-	HASHHDR    *hctl;			/* location of header in shared mem */
 	MemoryContext hcxt;			/* memory context to use for allocations */
+	HASHHDR    *hctl;			/* location of header in shared mem */
 } HASHCTL;

 /* Flags to indicate which parameters are supplied */
+#define HASH_PARTITION	0x001	/* Hashtable is used w/partitioned locking */
 #define HASH_SEGMENT	0x002	/* Set segment size */
-#define HASH_DIRSIZE	0x004	/* Set directory size */
+#define HASH_DIRSIZE	0x004	/* Set directory size (initial and max) */
 #define HASH_FFACTOR	0x008	/* Set fill factor */
 #define HASH_FUNCTION	0x010	/* Set user defined hash function */
-#define HASH_ELEM		0x020	/* Set key/entry size */
+#define HASH_ELEM		0x020	/* Set keysize and entrysize */
 #define HASH_SHARED_MEM 0x040	/* Hashtable is in shared memory */
 #define HASH_ATTACH		0x080	/* Do not initialize hctl */
 #define HASH_ALLOC		0x100	/* Set memory allocator */
-#define HASH_CONTEXT	0x200	/* Set explicit memory context */
+#define HASH_CONTEXT	0x200	/* Set memory allocation context */
 #define HASH_COMPARE	0x400	/* Set user defined comparison function */
 #define HASH_KEYCOPY	0x800	/* Set user defined key-copying function */
...@@ -183,10 +123,16 @@ extern void hash_destroy(HTAB *hashp); ...@@ -183,10 +123,16 @@ extern void hash_destroy(HTAB *hashp);
extern void hash_stats(const char *where, HTAB *hashp); extern void hash_stats(const char *where, HTAB *hashp);
extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action,
bool *foundPtr); bool *foundPtr);
extern uint32 get_hash_value(HTAB *hashp, const void *keyPtr);
extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr,
uint32 hashvalue, HASHACTION action,
bool *foundPtr);
extern long hash_get_num_entries(HTAB *hashp);
extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp); extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp);
extern void *hash_seq_search(HASH_SEQ_STATUS *status); extern void *hash_seq_search(HASH_SEQ_STATUS *status);
extern Size hash_estimate_size(long num_entries, Size entrysize); extern Size hash_estimate_size(long num_entries, Size entrysize);
extern long hash_select_dirsize(long num_entries); extern long hash_select_dirsize(long num_entries);
extern Size hash_get_shared_size(HASHCTL *info, int flags);
/* /*
* prototypes for functions in hashfn.c * prototypes for functions in hashfn.c
......