Commit 75ae538b authored by Andres Freund

Use more efficient hashtable for tidbitmap.c to speed up bitmap scans.

Use the new simplehash.h hash table implementation to speed up
tidbitmap.c. For bitmap-scan-heavy queries, speedups of over 100% have
been measured. Both lossy and exact scans benefit, but the wins are
bigger for mostly exact scans.

The conversion is mostly trivial, except that tbm_lossify() now restarts
lossifying at the point it previously stopped. Otherwise the hash table
becomes unbalanced, because the scan is done in hash order, leaving the
end of the hashtable more densely filled than the beginning. That caused
performance issues with dynahash as well, but due to its open chaining
they were less pronounced than with the linear addressing from
simplehash.h.

Reviewed-By: Tomas Vondra
Discussion: <20160727004333.r3e2k2y6fvk2ntup@alap3.anarazel.de>
parent b30d3ea8
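
As an illustration of the tbm_lossify() restart behaviour described above, here is a small standalone C sketch (an editor's addition, not part of the commit): a toy bucket array stands in for simplehash's flat table, and a saved cursor plays the role of the new tbm->lossify_start field, so successive compression passes walk different regions of the table instead of always draining its beginning. NBUCKETS, compress_some() and the "lossify by zeroing" step are made up for the example; the real patch stores the simplehash iterator position i.cur, as the tbm_lossify() hunk below shows.

    #include <stdio.h>

    #define NBUCKETS 16

    static int buckets[NBUCKETS];      /* 0 = empty, >0 = an "exact" entry */
    static int lossify_start = 0;      /* analogous to tbm->lossify_start */

    /* Compress up to 'goal' occupied slots, resuming where we left off. */
    static void
    compress_some(int goal)
    {
        int start = lossify_start;
        int done = 0;
        int i;

        for (i = 0; i < NBUCKETS && done < goal; i++)
        {
            int cur = (start + i) % NBUCKETS;

            if (buckets[cur] == 0)
                continue;              /* empty slot, nothing to compress */
            buckets[cur] = 0;          /* "lossify" this entry */
            done++;
            /* remember where to resume so passes spread over the whole table */
            lossify_start = (cur + 1) % NBUCKETS;
        }
    }

    int
    main(void)
    {
        int i;

        for (i = 0; i < NBUCKETS; i++)
            buckets[i] = i + 1;        /* fill every slot */

        compress_some(6);              /* clears buckets 0..5  */
        compress_some(6);              /* clears buckets 6..11 */
        printf("next pass would resume at bucket %d\n", lossify_start);
        return 0;
    }
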
src/backend/nodes/tidbitmap.c

@@ -43,7 +43,6 @@
 #include "access/htup_details.h"
 #include "nodes/bitmapset.h"
 #include "nodes/tidbitmap.h"
-#include "utils/hsearch.h"
 
 /*
  * The maximum number of tuples per page is not large (typically 256 with
@@ -61,12 +60,12 @@
  * for that page in the page table.
  *
  * We actually store both exact pages and lossy chunks in the same hash
- * table, using identical data structures. (This is because dynahash.c's
- * memory management doesn't allow space to be transferred easily from one
- * hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the
- * same as MAX_TUPLES_PER_PAGE, or at least not too different. But we
- * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
- * remainder operations. So, define it like this:
+ * table, using identical data structures. (This is because the memory
+ * management for hashtables doesn't easily/efficiently allow space to be
+ * transferred easily from one hashtable to another.) Therefore it's best
+ * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
+ * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
+ * avoid expensive integer remainder operations. So, define it like this:
  */
 #define PAGES_PER_CHUNK (BLCKSZ / 32)
 
@@ -97,21 +96,22 @@
 typedef struct PagetableEntry
 {
 	BlockNumber blockno;		/* page number (hashtable key) */
+	char		status;			/* hash entry status */
 	bool		ischunk;		/* T = lossy storage, F = exact */
 	bool		recheck;		/* should the tuples be rechecked? */
 	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
 } PagetableEntry;
 
 /*
- * dynahash.c is optimized for relatively large, long-lived hash tables.
- * This is not ideal for TIDBitMap, particularly when we are using a bitmap
- * scan on the inside of a nestloop join: a bitmap may well live only long
- * enough to accumulate one entry in such cases. We therefore avoid creating
- * an actual hashtable until we need two pagetable entries. When just one
- * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
- * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
- * to zero or one page again. So, status can be TBM_HASH even when nentries
- * is zero or one.)
+ * We want to avoid the overhead of creating the hashtable, which is
+ * comparatively large, when not necessary. Particularly when we are using a
+ * bitmap scan on the inside of a nestloop join: a bitmap may well live only
+ * long enough to accumulate one entry in such cases. We therefore avoid
+ * creating an actual hashtable until we need two pagetable entries. When
+ * just one pagetable entry is needed, we store it in a fixed field of
+ * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
+ * shrinks down to zero or one page again. So, status can be TBM_HASH even
+ * when nentries is zero or one.)
  */
 typedef enum
 {
@@ -128,12 +128,13 @@ struct TIDBitmap
 	NodeTag		type;			/* to make it a valid Node */
 	MemoryContext mcxt;			/* memory context containing me */
 	TBMStatus	status;			/* see codes above */
-	HTAB	   *pagetable;		/* hash table of PagetableEntry's */
+	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
 	int			nentries;		/* number of entries in pagetable */
 	int			maxentries;		/* limit on same to meet maxbytes */
 	int			npages;			/* number of exact entries in pagetable */
 	int			nchunks;		/* number of lossy entries in pagetable */
 	bool		iterating;		/* tbm_begin_iterate called? */
+	uint32		lossify_start;	/* offset to start lossifying hashtable at */
 	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
 	/* these are valid when iterating is true: */
 	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
@@ -168,6 +169,35 @@ static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_lossify(TIDBitmap *tbm);
 static int	tbm_comparator(const void *left, const void *right);
 
+/*
+ * Simple inline murmur hash implementation for the exact width required, for
+ * performance.
+ */
+static inline uint32
+hash_blockno(BlockNumber b)
+{
+	uint32		h = b;
+
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+	return h;
+}
+
+/* define hashtable mapping block numbers to PagetableEntry's */
+#define SH_PREFIX pagetable
+#define SH_ELEMENT_TYPE PagetableEntry
+#define SH_KEY_TYPE BlockNumber
+#define SH_KEY blockno
+#define SH_HASH_KEY(tb, key) hash_blockno(key)
+#define SH_EQUAL(tb, a, b) a == b
+#define SH_SCOPE static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
 /*
  * tbm_create - create an initially-empty bitmap
  *
@@ -190,17 +220,16 @@ tbm_create(long maxbytes)
 
 	/*
 	 * Estimate number of hashtable entries we can have within maxbytes. This
-	 * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
-	 * pointer per hash entry, which is crude but good enough for our purpose.
-	 * Also count an extra Pointer per entry for the arrays created during
-	 * iteration readout.
+	 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
+	 * for our purpose. Also count an extra Pointer per entry for the arrays
+	 * created during iteration readout.
	 */
 	nbuckets = maxbytes /
-		(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
-		 + sizeof(Pointer) + sizeof(Pointer));
+		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
 	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
 	nbuckets = Max(nbuckets, 16);			/* sanity limit */
 	tbm->maxentries = (int) nbuckets;
+	tbm->lossify_start = 0;
 
 	return tbm;
 }
@@ -212,32 +241,25 @@ tbm_create(long maxbytes)
 static void
 tbm_create_pagetable(TIDBitmap *tbm)
 {
-	HASHCTL		hash_ctl;
-
 	Assert(tbm->status != TBM_HASH);
 	Assert(tbm->pagetable == NULL);
 
-	/* Create the hashtable proper */
-	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-	hash_ctl.keysize = sizeof(BlockNumber);
-	hash_ctl.entrysize = sizeof(PagetableEntry);
-	hash_ctl.hcxt = tbm->mcxt;
-	tbm->pagetable = hash_create("TIDBitmap",
-								 128,	/* start small and extend */
-								 &hash_ctl,
-								 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+	tbm->pagetable = pagetable_create(tbm->mcxt, 128);
 
 	/* If entry1 is valid, push it into the hashtable */
 	if (tbm->status == TBM_ONE_PAGE)
 	{
 		PagetableEntry *page;
 		bool		found;
+		char		oldstatus;
 
-		page = (PagetableEntry *) hash_search(tbm->pagetable,
-											  (void *) &tbm->entry1.blockno,
-											  HASH_ENTER, &found);
+		page = pagetable_insert(tbm->pagetable,
								tbm->entry1.blockno,
								&found);
 		Assert(!found);
+		oldstatus = page->status;
 		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
+		page->status = oldstatus;
 	}
 
 	tbm->status = TBM_HASH;
@@ -250,7 +272,7 @@ void
 tbm_free(TIDBitmap *tbm)
 {
 	if (tbm->pagetable)
-		hash_destroy(tbm->pagetable);
+		pagetable_destroy(tbm->pagetable);
 	if (tbm->spages)
 		pfree(tbm->spages);
 	if (tbm->schunks)
@@ -357,12 +379,12 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b)
 		tbm_union_page(a, &b->entry1);
 	else
 	{
-		HASH_SEQ_STATUS status;
+		pagetable_iterator i;
 		PagetableEntry *bpage;
 
 		Assert(b->status == TBM_HASH);
-		hash_seq_init(&status, b->pagetable);
-		while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+		pagetable_start_iterate(b->pagetable, &i);
+		while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
 			tbm_union_page(a, bpage);
 	}
 }
@@ -449,12 +471,12 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 	}
 	else
 	{
-		HASH_SEQ_STATUS status;
+		pagetable_iterator i;
 		PagetableEntry *apage;
 
 		Assert(a->status == TBM_HASH);
-		hash_seq_init(&status, a->pagetable);
-		while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+		pagetable_start_iterate(a->pagetable, &i);
+		while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
 		{
 			if (tbm_intersect_page(a, apage, b))
 			{
@@ -464,9 +486,7 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 				else
 					a->npages--;
 				a->nentries--;
-				if (hash_search(a->pagetable,
-								(void *) &apage->blockno,
-								HASH_REMOVE, NULL) == NULL)
+				if (!pagetable_delete(a->pagetable, apage->blockno))
 					elog(ERROR, "hash table corrupted");
 			}
 		}
@@ -606,7 +626,7 @@ tbm_begin_iterate(TIDBitmap *tbm)
 	 */
 	if (tbm->status == TBM_HASH && !tbm->iterating)
 	{
-		HASH_SEQ_STATUS status;
+		pagetable_iterator i;
 		PagetableEntry *page;
 		int			npages;
 		int			nchunks;
@@ -620,9 +640,9 @@ tbm_begin_iterate(TIDBitmap *tbm)
 				MemoryContextAlloc(tbm->mcxt,
								   tbm->nchunks * sizeof(PagetableEntry *));
 
-		hash_seq_init(&status, tbm->pagetable);
 		npages = nchunks = 0;
-		while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+		pagetable_start_iterate(tbm->pagetable, &i);
+		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
 		{
 			if (page->ischunk)
 				tbm->schunks[nchunks++] = page;
@@ -791,9 +811,7 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
 		return page;
 	}
 
-	page = (PagetableEntry *) hash_search(tbm->pagetable,
-										  (void *) &pageno,
-										  HASH_FIND, NULL);
+	page = pagetable_lookup(tbm->pagetable, pageno);
 	if (page == NULL)
 		return NULL;
 	if (page->ischunk)
@@ -834,15 +852,16 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
 		}
 
 		/* Look up or create an entry */
-		page = (PagetableEntry *) hash_search(tbm->pagetable,
-											  (void *) &pageno,
-											  HASH_ENTER, &found);
+		page = pagetable_insert(tbm->pagetable, pageno, &found);
 	}
 
 	/* Initialize it if not present before */
 	if (!found)
 	{
+		char		oldstatus = page->status;
+
 		MemSet(page, 0, sizeof(PagetableEntry));
+		page->status = oldstatus;
 		page->blockno = pageno;
 		/* must count it too */
 		tbm->nentries++;
@@ -869,9 +888,9 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
 
 	bitno = pageno % PAGES_PER_CHUNK;
 	chunk_pageno = pageno - bitno;
-	page = (PagetableEntry *) hash_search(tbm->pagetable,
-										  (void *) &chunk_pageno,
-										  HASH_FIND, NULL);
+
+	page = pagetable_lookup(tbm->pagetable, chunk_pageno);
+
 	if (page != NULL && page->ischunk)
 	{
 		int			wordnum = WORDNUM(bitno);
@@ -912,9 +931,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 	 */
 	if (bitno != 0)
 	{
-		if (hash_search(tbm->pagetable,
-						(void *) &pageno,
-						HASH_REMOVE, NULL) != NULL)
+		if (pagetable_delete(tbm->pagetable, pageno))
 		{
 			/* It was present, so adjust counts */
 			tbm->nentries--;
@@ -923,14 +940,15 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 	}
 
 	/* Look up or create entry for chunk-header page */
-	page = (PagetableEntry *) hash_search(tbm->pagetable,
-										  (void *) &chunk_pageno,
-										  HASH_ENTER, &found);
+	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
 
 	/* Initialize it if not present before */
 	if (!found)
 	{
+		char		oldstatus = page->status;
+
 		MemSet(page, 0, sizeof(PagetableEntry));
+		page->status = oldstatus;
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* must count it too */
@@ -939,8 +957,11 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 	}
 	else if (!page->ischunk)
 	{
+		char		oldstatus = page->status;
+
 		/* chunk header page was formerly non-lossy, make it lossy */
 		MemSet(page, 0, sizeof(PagetableEntry));
+		page->status = oldstatus;
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* we assume it had some tuple bit(s) set, so mark it lossy */
@@ -962,7 +983,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 static void
 tbm_lossify(TIDBitmap *tbm)
 {
-	HASH_SEQ_STATUS status;
+	pagetable_iterator i;
 	PagetableEntry *page;
 
 	/*
@@ -977,8 +998,8 @@ tbm_lossify(TIDBitmap *tbm)
 	Assert(!tbm->iterating);
 	Assert(tbm->status == TBM_HASH);
 
-	hash_seq_init(&status, tbm->pagetable);
-	while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
+	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
 	{
 		if (page->ischunk)
 			continue;			/* already a chunk header */
@@ -995,15 +1016,19 @@ tbm_lossify(TIDBitmap *tbm)
 		if (tbm->nentries <= tbm->maxentries / 2)
 		{
-			/* we have done enough */
-			hash_seq_term(&status);
+			/*
+			 * We have made enough room. Remember where to start lossifying
+			 * next round, so we evenly iterate over the hashtable.
+			 */
+			tbm->lossify_start = i.cur;
 			break;
 		}
 
 		/*
 		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
-		 * hashtable. We can continue the same seq_search scan since we do
-		 * not care whether we visit lossy chunks or not.
+		 * hashtable and may have deleted the non-lossy chunk. We can
+		 * continue the same hash table scan, since failure to visit one
+		 * element or visiting the newly inserted element, isn't fatal.
		 */
 	}
 
 	/*
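
For reference, the hash_blockno() routine added by this patch is the 32-bit Murmur3 finalizer, which scatters consecutive block numbers across the full 32-bit hash space before simplehash's linear addressing maps them to buckets. Below is a standalone version (an editor's sketch, not part of the commit; uint32_t stands in for BlockNumber) that can be compiled and run outside the server:

    #include <stdint.h>
    #include <stdio.h>

    /* Same shifts and constants as the hash_blockno() added in the diff above. */
    static inline uint32_t
    hash_blockno(uint32_t b)
    {
        uint32_t h = b;

        h ^= h >> 16;
        h *= 0x85ebca6b;
        h ^= h >> 13;
        h *= 0xc2b2ae35;
        h ^= h >> 16;
        return h;
    }

    int
    main(void)
    {
        uint32_t b;

        /* consecutive block numbers come out well scattered */
        for (b = 0; b < 4; b++)
            printf("block %u -> %08x\n", (unsigned) b, (unsigned) hash_blockno(b));
        return 0;
    }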