Commit ab2324fd authored by Tom Lane's avatar Tom Lane

Improve comments about partitioned hash table freelists.

While I couldn't find any live bugs in commit 44ca4022, the comments
seemed pretty far from adequate; in particular it was not made plain that
"borrowing" entries from other freelists is critical for correctness.
Try to improve the commentary.  A couple of very minor code style
tweaks, as well.

Discussion: https://postgr.es/m/10593.1500670709@sss.pgh.pa.us
parent 991c8b04
...@@ -15,7 +15,8 @@ ...@@ -15,7 +15,8 @@
* to hash_create. This prevents any attempt to split buckets on-the-fly. * to hash_create. This prevents any attempt to split buckets on-the-fly.
* Therefore, each hash bucket chain operates independently, and no fields * Therefore, each hash bucket chain operates independently, and no fields
* of the hash header change after init except nentries and freeList. * of the hash header change after init except nentries and freeList.
* A partitioned table uses spinlocks to guard changes of those fields. * (A partitioned table uses multiple copies of those fields, guarded by
* spinlocks, for additional concurrency.)
* This lets any subset of the hash buckets be treated as a separately * This lets any subset of the hash buckets be treated as a separately
* lockable partition. We expect callers to use the low-order bits of a * lockable partition. We expect callers to use the low-order bits of a
* lookup key's hash value as a partition number --- this will work because * lookup key's hash value as a partition number --- this will work because
...@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET; ...@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
typedef HASHBUCKET *HASHSEGMENT; typedef HASHBUCKET *HASHSEGMENT;
/* /*
* Using array of FreeListData instead of separate arrays of mutexes, nentries * Per-freelist data.
* and freeLists prevents, at least partially, sharing one cache line between *
* different mutexes (see below). * In a partitioned hash table, each freelist is associated with a specific
* set of hashcodes, as determined by the FREELIST_IDX() macro below.
* nentries tracks the number of live hashtable entries having those hashcodes
* (NOT the number of entries in the freelist, as you might expect).
*
* The coverage of a freelist might be more or less than one partition, so it
* needs its own lock rather than relying on caller locking. Relying on that
* wouldn't work even if the coverage was the same, because of the occasional
* need to "borrow" entries from another freelist; see get_hash_entry().
*
* Using an array of FreeListData instead of separate arrays of mutexes,
* nentries and freeLists helps to reduce sharing of cache lines between
* different mutexes.
*/ */
typedef struct typedef struct
{ {
slock_t mutex; /* spinlock */ slock_t mutex; /* spinlock for this freelist */
long nentries; /* number of entries */ long nentries; /* number of entries in associated buckets */
HASHELEMENT *freeList; /* list of free elements */ HASHELEMENT *freeList; /* chain of free elements */
} FreeListData; } FreeListData;
/* /*
...@@ -143,12 +156,14 @@ typedef struct ...@@ -143,12 +156,14 @@ typedef struct
struct HASHHDR struct HASHHDR
{ {
/* /*
* The freelist can become a point of contention on high-concurrency hash * The freelist can become a point of contention in high-concurrency hash
* tables, so we use an array of freelist, each with its own mutex and * tables, so we use an array of freelists, each with its own mutex and
* nentries count, instead of just a single one. * nentries count, instead of just a single one. Although the freelists
* normally operate independently, we will scavenge entries from freelists
* other than a hashcode's default freelist when necessary.
* *
* If hash table is not partitioned only freeList[0] is used and spinlocks * If the hash table is not partitioned, only freeList[0] is used and its
* are not used at all. * spinlock is not used at all; callers' locking is assumed sufficient.
*/ */
FreeListData freeList[NUM_FREELISTS]; FreeListData freeList[NUM_FREELISTS];
...@@ -184,7 +199,7 @@ struct HASHHDR ...@@ -184,7 +199,7 @@ struct HASHHDR
/* Is the table partitioned (i.e., usable for concurrent access)? */
#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

/*
 * Map a hashcode to the index of the freelist covering it.  An
 * unpartitioned table uses only freeList[0].  Note the macro argument
 * is parenthesized to avoid precedence surprises at expansion sites.
 */
#define FREELIST_IDX(hctl, hashcode) \
	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
/* /*
* Top control structure for a hashtable --- in a shared table, each backend * Top control structure for a hashtable --- in a shared table, each backend
...@@ -506,8 +521,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) ...@@ -506,8 +521,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
nelem_alloc_first; nelem_alloc_first;
/* /*
* If hash table is partitioned all freeLists have equal number of * If hash table is partitioned, give each freelist an equal share of
* elements. Otherwise only freeList[0] is used. * the initial allocation. Otherwise only freeList[0] is used.
*/ */
if (IS_PARTITIONED(hashp->hctl)) if (IS_PARTITIONED(hashp->hctl))
freelist_partitions = NUM_FREELISTS; freelist_partitions = NUM_FREELISTS;
...@@ -515,10 +530,13 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) ...@@ -515,10 +530,13 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
freelist_partitions = 1; freelist_partitions = 1;
nelem_alloc = nelem / freelist_partitions; nelem_alloc = nelem / freelist_partitions;
if (nelem_alloc == 0) if (nelem_alloc <= 0)
nelem_alloc = 1; nelem_alloc = 1;
/* Make sure all memory will be used */ /*
* Make sure we'll allocate all the requested elements; freeList[0]
* gets the excess if the request isn't divisible by NUM_FREELISTS.
*/
if (nelem_alloc * freelist_partitions < nelem) if (nelem_alloc * freelist_partitions < nelem)
nelem_alloc_first = nelem_alloc_first =
nelem - nelem_alloc * (freelist_partitions - 1); nelem - nelem_alloc * (freelist_partitions - 1);
...@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem) ...@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
int i; int i;
/* /*
* initialize mutex if it's a partitioned table * initialize mutexes if it's a partitioned table
*/ */
if (IS_PARTITIONED(hctl)) if (IS_PARTITIONED(hctl))
for (i = 0; i < NUM_FREELISTS; i++) for (i = 0; i < NUM_FREELISTS; i++)
...@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp, ...@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
bool *foundPtr) bool *foundPtr)
{ {
HASHHDR *hctl = hashp->hctl; HASHHDR *hctl = hashp->hctl;
int freelist_idx = FREELIST_IDX(hctl, hashvalue);
Size keysize; Size keysize;
uint32 bucket; uint32 bucket;
long segment_num; long segment_num;
...@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp, ...@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
HASHBUCKET currBucket; HASHBUCKET currBucket;
HASHBUCKET *prevBucketPtr; HASHBUCKET *prevBucketPtr;
HashCompareFunc match; HashCompareFunc match;
int freelist_idx = FREELIST_IDX(hctl, hashvalue);
#if HASH_STATISTICS #if HASH_STATISTICS
hash_accesses++; hash_accesses++;
...@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp, ...@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
if (IS_PARTITIONED(hctl)) if (IS_PARTITIONED(hctl))
SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex)); SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
/* delete the record from the appropriate nentries counter. */
Assert(hctl->freeList[freelist_idx].nentries > 0); Assert(hctl->freeList[freelist_idx].nentries > 0);
hctl->freeList[freelist_idx].nentries--; hctl->freeList[freelist_idx].nentries--;
/* remove record from hash bucket's chain. */ /* remove record from hash bucket's chain. */
*prevBucketPtr = currBucket->link; *prevBucketPtr = currBucket->link;
/* add the record to the freelist for this table. */ /* add the record to the appropriate freelist. */
currBucket->link = hctl->freeList[freelist_idx].freeList; currBucket->link = hctl->freeList[freelist_idx].freeList;
hctl->freeList[freelist_idx].freeList = currBucket; hctl->freeList[freelist_idx].freeList = currBucket;
...@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp, ...@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
} }
/* /*
* create a new entry if possible * Allocate a new hashtable entry if possible; return NULL if out of memory.
* (Or, if the underlying space allocator throws error for out-of-memory,
* we won't return at all.)
*/ */
static HASHBUCKET static HASHBUCKET
get_hash_entry(HTAB *hashp, int freelist_idx) get_hash_entry(HTAB *hashp, int freelist_idx)
{ {
HASHHDR *hctl = hashp->hctl; HASHHDR *hctl = hashp->hctl;
HASHBUCKET newElement; HASHBUCKET newElement;
int borrow_from_idx;
for (;;) for (;;)
{ {
...@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx) ...@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
if (IS_PARTITIONED(hctl)) if (IS_PARTITIONED(hctl))
SpinLockRelease(&hctl->freeList[freelist_idx].mutex); SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
/* no free elements. allocate another chunk of buckets */ /*
* No free elements in this freelist. In a partitioned table, there
* might be entries in other freelists, but to reduce contention we
* prefer to first try to get another chunk of buckets from the main
* shmem allocator. If that fails, though, we *MUST* root through all
* the other freelists before giving up. There are multiple callers
* that assume that they can allocate every element in the initially
* requested table size, or that deleting an element guarantees they
* can insert a new element, even if shared memory is entirely full.
* Failing because the needed element is in a different freelist is
* not acceptable.
*/
if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx)) if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
{ {
int borrow_from_idx;
if (!IS_PARTITIONED(hctl)) if (!IS_PARTITIONED(hctl))
return NULL; /* out of memory */ return NULL; /* out of memory */
/* try to borrow element from another partition */ /* try to borrow element from another freelist */
borrow_from_idx = freelist_idx; borrow_from_idx = freelist_idx;
for (;;) for (;;)
{ {
borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS; borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
if (borrow_from_idx == freelist_idx) if (borrow_from_idx == freelist_idx)
break; break; /* examined all freelists, fail */
SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex)); SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
newElement = hctl->freeList[borrow_from_idx].freeList; newElement = hctl->freeList[borrow_from_idx].freeList;
...@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx) ...@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
hctl->freeList[borrow_from_idx].freeList = newElement->link; hctl->freeList[borrow_from_idx].freeList = newElement->link;
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex)); SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
/* careful: count the new element in its proper freelist */
SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
hctl->freeList[freelist_idx].nentries++; hctl->freeList[freelist_idx].nentries++;
SpinLockRelease(&hctl->freeList[freelist_idx].mutex); SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
break; return newElement;
} }
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex)); SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
} }
return newElement; /* no elements available to borrow either, so out of memory */
return NULL;
} }
} }
...@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp) ...@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
long sum = hashp->hctl->freeList[0].nentries; long sum = hashp->hctl->freeList[0].nentries;
/* /*
* We currently don't bother with the mutex; it's only sensible to call * We currently don't bother with acquiring the mutexes; it's only
* this function if you've got lock on all partitions of the table. * sensible to call this function if you've got lock on all partitions of
* the table.
*/ */
if (IS_PARTITIONED(hashp->hctl))
if (!IS_PARTITIONED(hashp->hctl)) {
return sum; for (i = 1; i < NUM_FREELISTS; i++)
sum += hashp->hctl->freeList[i].nentries;
for (i = 1; i < NUM_FREELISTS; i++) }
sum += hashp->hctl->freeList[i].nentries;
return sum; return sum;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment