Commit 44ca4022 authored by Robert Haas

Partition the freelist for shared dynahash tables.

Without this, contention on the freelist can become a pretty serious
problem on large servers.

Aleksander Alekseev, reviewed by Anastasia Lubennikova, Dilip Kumar,
and me.
parent ea4b8bd6
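
The idea is easier to see outside dynahash's machinery, so before the diff, here is a minimal, self-contained sketch of the same freelist-striping technique. It is illustrative only, not code from the patch: `Element`, `freeLists`, `get_entry`, and `free_entry` are hypothetical names, and pthread mutexes stand in for the `slock_t` spinlocks the real code uses. As in the patch, the stripe is chosen from the entry's hash value, so each backend contends on only one of NUM_FREELISTS locks.

```c
/* Illustrative sketch only -- pthread mutexes stand in for slock_t. */
#include <pthread.h>
#include <stddef.h>

#define NUM_FREELISTS 32

typedef struct Element
{
    struct Element *link;       /* next element on the freelist */
    /* the caller's key and payload would follow in dynahash */
} Element;

typedef struct
{
    pthread_mutex_t mutex;      /* guards nentries and freeList below */
    long        nentries;       /* number of live entries charged here */
    Element    *freeList;       /* head of this stripe's free-element chain */
} FreeListData;

static FreeListData freeLists[NUM_FREELISTS];

/* Call once before use: mutexes in a static array need explicit init. */
static void
freelists_init(void)
{
    int         i;

    for (i = 0; i < NUM_FREELISTS; i++)
        pthread_mutex_init(&freeLists[i].mutex, NULL);
}

/* Pick a stripe from the hash value, as the new FREELIST_IDX macro does. */
static int
freelist_idx(unsigned int hashvalue)
{
    return hashvalue % NUM_FREELISTS;
}

/* Pop a free element; only one stripe's lock is touched. */
static Element *
get_entry(unsigned int hashvalue)
{
    FreeListData *fl = &freeLists[freelist_idx(hashvalue)];
    Element    *e;

    pthread_mutex_lock(&fl->mutex);
    e = fl->freeList;
    if (e != NULL)
    {
        fl->freeList = e->link;
        fl->nentries++;
    }
    pthread_mutex_unlock(&fl->mutex);
    return e;       /* NULL: stripe empty (the real code allocates or borrows) */
}

/* Push an element back; it returns to the stripe its hash value selects. */
static void
free_entry(Element *e, unsigned int hashvalue)
{
    FreeListData *fl = &freeLists[freelist_idx(hashvalue)];

    pthread_mutex_lock(&fl->mutex);
    e->link = fl->freeList;
    fl->freeList = e;
    fl->nentries--;
    pthread_mutex_unlock(&fl->mutex);
}
```

Because an entry is always freed to the stripe derived from its own hash value, an entry allocated from stripe i normally returns to stripe i, and each stripe's nentries stays consistent without any global lock.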
@@ -15,7 +15,7 @@
  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition.  We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -111,6 +111,8 @@
 #define DEF_DIRSIZE            256
 #define DEF_FFACTOR            1    /* default fill factor */
 
+/* Number of freelists to be used for a partitioned hash table. */
+#define NUM_FREELISTS 32
+
 /* A hash bucket is a linked list of HASHELEMENTs */
 typedef HASHELEMENT *HASHBUCKET;
@@ -118,6 +120,18 @@ typedef HASHELEMENT *HASHBUCKET;
 /* A hash segment is an array of bucket headers */
 typedef HASHBUCKET *HASHSEGMENT;
 
+/*
+ * Using an array of FreeListData instead of separate arrays of mutexes,
+ * nentries and freeLists prevents, at least partially, sharing one cache
+ * line between different mutexes (see below).
+ */
+typedef struct
+{
+    slock_t     mutex;          /* spinlock */
+    long        nentries;       /* number of entries */
+    HASHELEMENT *freeList;      /* list of free elements */
+} FreeListData;
+
 /*
  * Header structure for a hash table --- contains all changeable info
  *
@@ -128,12 +142,15 @@ typedef HASHBUCKET *HASHSEGMENT;
  */
 struct HASHHDR
 {
-    /* In a partitioned table, take this lock to touch nentries or freeList */
-    slock_t     mutex;          /* unused if not partitioned table */
-
-    /* These fields change during entry addition/deletion */
-    long        nentries;       /* number of entries in hash table */
-    HASHELEMENT *freeList;      /* linked list of free elements */
+    /*
+     * The freelist can become a point of contention on high-concurrency hash
+     * tables, so we use an array of freelists, each with its own mutex and
+     * nentries count, instead of just a single one.
+     *
+     * If the hash table is not partitioned, only freeList[0] is used and
+     * spinlocks are not used at all.
+     */
+    FreeListData freeList[NUM_FREELISTS];
 
     /* These fields can change, but not in a partitioned table */
     /* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +183,9 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
+#define FREELIST_IDX(hctl, hashcode) \
+    (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
@@ -219,10 +239,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int  choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -482,11 +502,41 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
     if ((flags & HASH_SHARED_MEM) ||
         nelem < hctl->nelem_alloc)
     {
-        if (!element_alloc(hashp, (int) nelem))
-            ereport(ERROR,
-                    (errcode(ERRCODE_OUT_OF_MEMORY),
-                     errmsg("out of memory")));
+        int         i,
+                    freelist_partitions,
+                    nelem_alloc,
+                    nelem_alloc_first;
+
+        /*
+         * If the hash table is partitioned, all freeLists have an equal
+         * number of elements. Otherwise only freeList[0] is used.
+         */
+        if (IS_PARTITIONED(hashp->hctl))
+            freelist_partitions = NUM_FREELISTS;
+        else
+            freelist_partitions = 1;
+
+        nelem_alloc = nelem / freelist_partitions;
+        if (nelem_alloc == 0)
+            nelem_alloc = 1;
+
+        /* Make sure all memory will be used */
+        if (nelem_alloc * freelist_partitions < nelem)
+            nelem_alloc_first =
+                nelem - nelem_alloc * (freelist_partitions - 1);
+        else
+            nelem_alloc_first = nelem_alloc;
+
+        for (i = 0; i < freelist_partitions; i++)
+        {
+            int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+            if (!element_alloc(hashp, temp, i))
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+        }
     }
 
     if (flags & HASH_FIXED_SIZE)
         hashp->isfixed = true;
@@ -503,9 +553,6 @@ hdefault(HTAB *hashp)
 
     MemSet(hctl, 0, sizeof(HASHHDR));
 
-    hctl->nentries = 0;
-    hctl->freeList = NULL;
-
     hctl->dsize = DEF_DIRSIZE;
     hctl->nsegs = 0;
@@ -572,12 +619,14 @@ init_htab(HTAB *hashp, long nelem)
     HASHSEGMENT *segp;
     int         nbuckets;
     int         nsegs;
+    int         i;
 
     /*
      * initialize mutex if it's a partitioned table
      */
     if (IS_PARTITIONED(hctl))
-        SpinLockInit(&hctl->mutex);
+        for (i = 0; i < NUM_FREELISTS; i++)
+            SpinLockInit(&(hctl->freeList[i].mutex));
 
     /*
      * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +697,7 @@ init_htab(HTAB *hashp, long nelem)
             "HIGH MASK       ", hctl->high_mask,
             "LOW  MASK       ", hctl->low_mask,
             "NSEGS           ", hctl->nsegs,
-            "NENTRIES        ", hctl->nentries);
+            "NENTRIES        ", hash_get_num_entries(hashp));
 #endif
     return true;
 }
@@ -769,7 +818,7 @@ hash_stats(const char *where, HTAB *hashp)
             where, hashp->hctl->accesses, hashp->hctl->collisions);
 
     fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-            hashp->hctl->nentries, (long) hashp->hctl->keysize,
+            hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
             hashp->hctl->max_bucket, hashp->hctl->nsegs);
     fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
             where, hash_accesses, hash_collisions);
@@ -863,6 +912,7 @@ hash_search_with_hash_value(HTAB *hashp,
     HASHBUCKET  currBucket;
     HASHBUCKET *prevBucketPtr;
     HashCompareFunc match;
+    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
     hash_accesses++;
@@ -885,7 +935,7 @@ hash_search_with_hash_value(HTAB *hashp,
          * order of these tests is to try to check cheaper conditions first.
          */
         if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-            hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+            hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
             !has_seq_scans(hashp))
             (void) expand_table(hashp);
     }
@@ -943,20 +993,20 @@ hash_search_with_hash_value(HTAB *hashp,
             {
                 /* if partitioned, must lock to touch nentries and freeList */
                 if (IS_PARTITIONED(hctl))
-                    SpinLockAcquire(&hctl->mutex);
+                    SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
 
-                Assert(hctl->nentries > 0);
-                hctl->nentries--;
+                Assert(hctl->freeList[freelist_idx].nentries > 0);
+                hctl->freeList[freelist_idx].nentries--;
 
                 /* remove record from hash bucket's chain. */
                 *prevBucketPtr = currBucket->link;
 
                 /* add the record to the freelist for this table.  */
-                currBucket->link = hctl->freeList;
-                hctl->freeList = currBucket;
+                currBucket->link = hctl->freeList[freelist_idx].freeList;
+                hctl->freeList[freelist_idx].freeList = currBucket;
 
                 if (IS_PARTITIONED(hctl))
-                    SpinLockRelease(&hctl->mutex);
+                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
                 /*
                  * better hope the caller is synchronizing access to this
@@ -982,7 +1032,7 @@ hash_search_with_hash_value(HTAB *hashp,
                 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                      hashp->tabname);
 
-            currBucket = get_hash_entry(hashp);
+            currBucket = get_hash_entry(hashp, freelist_idx);
             if (currBucket == NULL)
             {
                 /* out of memory */
@@ -1175,39 +1225,69 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int freelist_idx)
 {
     HASHHDR    *hctl = hashp->hctl;
     HASHBUCKET  newElement;
+    int         borrow_from_idx;
 
     for (;;)
     {
         /* if partitioned, must lock to touch nentries and freeList */
         if (IS_PARTITIONED(hctl))
-            SpinLockAcquire(&hctl->mutex);
+            SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
         /* try to get an entry from the freelist */
-        newElement = hctl->freeList;
+        newElement = hctl->freeList[freelist_idx].freeList;
+
         if (newElement != NULL)
             break;
 
-        /* no free elements.  allocate another chunk of buckets */
         if (IS_PARTITIONED(hctl))
-            SpinLockRelease(&hctl->mutex);
+            SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-        if (!element_alloc(hashp, hctl->nelem_alloc))
+        /* no free elements.  allocate another chunk of buckets */
+        if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
         {
-            /* out of memory */
-            return NULL;
+            if (!IS_PARTITIONED(hctl))
+                return NULL;    /* out of memory */
+
+            /* try to borrow an element from another partition */
+            borrow_from_idx = freelist_idx;
+            for (;;)
+            {
+                borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
+                if (borrow_from_idx == freelist_idx)
+                    break;
+
+                SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
+                newElement = hctl->freeList[borrow_from_idx].freeList;
+
+                if (newElement != NULL)
+                {
+                    hctl->freeList[borrow_from_idx].freeList = newElement->link;
+                    SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+
+                    SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
+                    hctl->freeList[freelist_idx].nentries++;
+                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+                    break;
+                }
+
+                SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+            }
+
+            return newElement;
         }
     }
 
     /* remove entry from freelist, bump nentries */
-    hctl->freeList = newElement->link;
-    hctl->nentries++;
+    hctl->freeList[freelist_idx].freeList = newElement->link;
+    hctl->freeList[freelist_idx].nentries++;
 
     if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
     return newElement;
 }
@@ -1218,11 +1298,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+    int         i;
+    long        sum = hashp->hctl->freeList[0].nentries;
+
     /*
      * We currently don't bother with the mutex; it's only sensible to call
      * this function if you've got lock on all partitions of the table.
      */
-    return hashp->hctl->nentries;
+    if (!IS_PARTITIONED(hashp->hctl))
+        return sum;
+
+    for (i = 1; i < NUM_FREELISTS; i++)
+        sum += hashp->hctl->freeList[i].nentries;
+
+    return sum;
 }
 
 /*
@@ -1527,10 +1617,10 @@ seg_alloc(HTAB *hashp)
 }
 
 /*
- * allocate some new elements and link them into the free list
+ * allocate some new elements and link them into the indicated free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 {
     HASHHDR    *hctl = hashp->hctl;
     Size        elementSize;
@@ -1563,14 +1653,14 @@ element_alloc(HTAB *hashp, int nelem)
 
     /* if partitioned, must lock to touch freeList */
     if (IS_PARTITIONED(hctl))
-        SpinLockAcquire(&hctl->mutex);
+        SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 
     /* freelist could be nonempty if two backends did this concurrently */
-    firstElement->link = hctl->freeList;
-    hctl->freeList = prevElement;
+    firstElement->link = hctl->freeList[freelist_idx].freeList;
+    hctl->freeList[freelist_idx].freeList = prevElement;
 
     if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
     return true;
 }
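
The pre-allocation split in hash_create above is easy to check in isolation. The sketch below pulls the arithmetic out into a standalone program with a hypothetical nelem of 1000 (not a value from the commit): 1000 / 32 = 31 elements per freelist, which accounts for only 992 of them, so freeList[0] receives the remainder, 1000 - 31 * 31 = 39.

```c
/* Standalone check of hash_create's per-freelist split (illustrative). */
#include <stdio.h>

#define NUM_FREELISTS 32

int
main(void)
{
    long        nelem = 1000;   /* hypothetical table size */
    int         freelist_partitions = NUM_FREELISTS;    /* partitioned case */
    long        nelem_alloc,
                nelem_alloc_first;

    nelem_alloc = nelem / freelist_partitions;
    if (nelem_alloc == 0)
        nelem_alloc = 1;

    /* Make sure all memory will be used */
    if (nelem_alloc * freelist_partitions < nelem)
        nelem_alloc_first = nelem - nelem_alloc * (freelist_partitions - 1);
    else
        nelem_alloc_first = nelem_alloc;

    /* prints: freeList[0] gets 39, the other 31 lists get 31 each */
    printf("freeList[0] gets %ld, the other %d lists get %ld each\n",
           nelem_alloc_first, freelist_partitions - 1, nelem_alloc);
    return 0;
}
```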
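The subtlest part of the patch is the fallback in get_hash_entry when element_alloc fails (for example, a shared-memory table that has reached its fixed size): instead of reporting "out of memory" immediately, it scans the other freelists round-robin and borrows a spare element. Here is a condensed sketch of just that scan, reusing the declarations from the first sketch above; borrow_entry and home_idx are hypothetical names, and pthread mutexes again stand in for spinlocks.

```c
/* Reuses Element, freeLists and NUM_FREELISTS from the first sketch. */
static Element *
borrow_entry(int home_idx)
{
    int         borrow_from_idx = home_idx;
    Element    *e = NULL;

    for (;;)
    {
        /* advance cyclically; stop once we are back where we started */
        borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
        if (borrow_from_idx == home_idx)
            break;              /* every other stripe examined, all empty */

        pthread_mutex_lock(&freeLists[borrow_from_idx].mutex);
        e = freeLists[borrow_from_idx].freeList;
        if (e != NULL)
        {
            /* unlink from the donor stripe */
            freeLists[borrow_from_idx].freeList = e->link;
            pthread_mutex_unlock(&freeLists[borrow_from_idx].mutex);

            /* charge the entry to the stripe that asked for it */
            pthread_mutex_lock(&freeLists[home_idx].mutex);
            freeLists[home_idx].nentries++;
            pthread_mutex_unlock(&freeLists[home_idx].mutex);
            break;
        }
        pthread_mutex_unlock(&freeLists[borrow_from_idx].mutex);
    }
    return e;                   /* NULL only if every stripe was empty */
}
```

Note that nentries is charged to the requesting freelist, not the donor: this stays consistent because a freed element always returns to the freelist derived from its own hash value, which is the requester's.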