Commit ea69a0de authored by Robert Haas's avatar Robert Haas

Expand hash indexes more gradually.

Since hash indexes typically have very few overflow pages, adding a
new splitpoint essentially doubles the on-disk size of the index,
which can lead to large and abrupt increases in disk usage (and
perhaps long delays on occasion).  To mitigate this problem to some
degree, divide larger splitpoints into four equal phases.  This means
that, for example, instead of growing from 4GB to 8GB all at once, a
hash index will now grow from 4GB to 5GB to 6GB to 7GB to 8GB, which
is perhaps still not as smooth as we'd like but certainly an
improvement.

This changes the on-disk format of the metapage, so bump HASH_VERSION
from 2 to 3.  This will force a REINDEX of all existing hash indexes,
but that's probably a good idea anyway.  First, hash indexes from
pre-10 versions of PostgreSQL could easily be corrupted, and we don't
want to confuse corruption carried over from an older release with any
corruption caused despite the new write-ahead logging in v10.  Second,
it will let us remove some backward-compatibility code added by commit
293e24e5.

Mithun Cy, reviewed by Amit Kapila, Jesper Pedersen and me.  Regression
test outputs updated by me.

Discussion: http://postgr.es/m/CAD__OuhG6F1gQLCgMQNnMNgoCvOLQZz9zKYJQNYvYmmJoM42gA@mail.gmail.com
Discussion: http://postgr.es/m/CA+TgmoYty0jCf-pa+m+vYUJ716+AxM7nv_syvyanyf5O-L_i2A@mail.gmail.com
parent 334bf9c7
......@@ -45,7 +45,7 @@ lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM
hash_metapage_info(get_raw_page('test_hash_a_idx', 0));
-[ RECORD 1 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
magic | 105121344
version | 2
version | 3
ntuples | 1
bsize | 8152
bmsize | 4096
......@@ -57,7 +57,7 @@ ovflpoint | 2
firstfree | 0
nmaps | 1
procid | 450
spares | {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
spares | {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
mapp | {5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask,
......
......@@ -134,7 +134,7 @@ create index test_hashidx on test using hash (b);
select * from pgstathashindex('test_hashidx');
version | bucket_pages | overflow_pages | bitmap_pages | zero_pages | live_items | dead_items | free_percent
---------+--------------+----------------+--------------+------------+------------+------------+--------------
2 | 4 | 0 | 1 | 0 | 0 | 0 | 100
3 | 4 | 0 | 1 | 0 | 0 | 0 | 100
(1 row)
-- these should error with the wrong type
......@@ -235,7 +235,7 @@ select pgstatindex('test_partition_idx');
select pgstathashindex('test_partition_hash_idx');
pgstathashindex
---------------------
(2,8,0,1,0,0,0,100)
(3,8,0,1,0,0,0,100)
(1 row)
drop table test_partitioned;
......
......@@ -658,7 +658,7 @@ test=# SELECT * FROM hash_bitmap_info('con_hash_index', 2052);
test=# SELECT * FROM hash_metapage_info(get_raw_page('con_hash_index', 0));
-[ RECORD 1 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
magic | 105121344
version | 2
version | 3
ntuples | 500500
ffactor | 40
bsize | 8152
......@@ -667,11 +667,11 @@ bmshift | 15
maxbucket | 12512
highmask | 16383
lowmask | 8191
ovflpoint | 14
ovflpoint | 28
firstfree | 1204
nmaps | 1
procid | 450
spares | {0,0,0,0,0,0,1,1,1,1,1,4,59,704,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
spares | {0,0,0,0,0,0,1,1,1,1,1,1,1,1,3,4,4,4,45,55,58,59,508,567,628,704,1193,1202,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
mapp | {65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
</screen>
</para>
......
......@@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to
move primary bucket pages around after they are created.
Primary bucket pages (henceforth just "bucket pages") are allocated in
power-of-2 groups, called "split points" in the code. Buckets 0 and 1
are created when the index is initialized. At the first split, buckets 2
and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated;
when bucket 8 is needed, buckets 8-15 are allocated; etc. All the bucket
pages of a power-of-2 group appear consecutively in the index. This
addressing scheme allows the physical location of a bucket page to be
computed from the bucket number relatively easily, using only a small
amount of control information. We take the log2() of the bucket number
to determine which split point S the bucket belongs to, and then simply
add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
metapage) to compute the physical address. hashm_spares[S] can be
interpreted as the total number of overflow pages that have been allocated
before the bucket pages of splitpoint S. hashm_spares[0] is always 0,
so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
block numbers 1 and 2, just after the meta page. We always have
hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow
pages appearing between the bucket page groups of splitpoints N and N+1.
power-of-2 groups, called "split points" in the code.  That means at every new
splitpoint we double the existing number of buckets.  Allocating huge chunks
of bucket pages all at once isn't optimal, since it can take a very long time
to consume them.  To avoid this exponential growth of index size, we use a
trick to break up the allocation of buckets at a splitpoint into 4 equal
phases.  If (2 ^ x) is the total number of buckets that need to be allocated
at a splitpoint (from now on we shall call this a splitpoint group), then we
allocate one quarter (2 ^ (x - 2)) of the total at each phase of the
splitpoint group.  The next quarter is allocated only once the buckets of the
previous phase have been consumed.
For the initial splitpoint groups (< 10) we allocate all of their buckets in a
single phase, since the number of buckets allocated in those groups is small.
For groups >= 10 the allocation is divided into four equal phases.  At group
10 we allocate (2 ^ 9) buckets in 4 phases {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}; the
numbers in curly braces indicate the number of buckets allocated within each
phase of splitpoint group 10.  Similarly, for splitpoint groups 11 and 12 the
allocation phases will be {2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and
{2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9} respectively.  Thus each splitpoint group still
doubles the total number of buckets from the previous group, but it does so
incrementally, one phase at a time.  The bucket pages
allocated within one phase of a splitpoint group will appear consecutively in
the index. This addressing scheme allows the physical location of a bucket
page to be computed from the bucket number relatively easily, using only a
small amount of control information.  The function _hash_spareindex, given a
bucket number, first computes the splitpoint group the bucket belongs to and
then the phase within that group to which the bucket belongs.  Combining them
gives the global splitpoint phase number S to which the bucket belongs; we
then simply add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored
in the metapage) to the given bucket number to compute its physical address.
hashm_spares[S] can be interpreted as the total number
of overflow pages that have been allocated before the bucket pages of
splitpoint phase S. The hashm_spares[0] is always 0, so that buckets 0 and 1
always appear at block numbers 1 and 2, just after the meta page. We always
have hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow pages
appearing between the bucket page groups of splitpoint phases N and N+1.
(Note: the above describes what happens when filling an initially minimally
sized hash index. In practice, we try to estimate the required index size
and allocate a suitable number of splitpoints immediately, to avoid
sized hash index. In practice, we try to estimate the required index size and
allocate a suitable number of splitpoint phases immediately, to avoid
expensive re-splitting during initial index build.)
When S splitpoints exist altogether, the array entries hashm_spares[0]
through hashm_spares[S] are valid; hashm_spares[S] records the current
total number of overflow pages. New overflow pages are created as needed
at the end of the index, and recorded by incrementing hashm_spares[S].
When it is time to create a new splitpoint's worth of bucket pages, we
When it is time to create a new splitpoint phase's worth of bucket pages, we
copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
stored in the hashm_ovflpoint field of the meta page). This has the
effect of reserving the correct number of bucket pages at the end of the
......@@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during
an index extension we crash after allocating filesystem space and before
updating the metapage. Note that on filesystems that allow "holes" in
files, it's entirely likely that pages before the logical EOF are not yet
allocated: when we allocate a new splitpoint's worth of bucket pages, we
allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
physically zero the last such page to force the EOF up, and the first such
page will be used immediately, but the intervening pages are not written
until needed.
......
......@@ -49,7 +49,7 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
* Convert to absolute page number by adding the number of bucket pages
* that exist before this split point.
*/
return (BlockNumber) ((1 << i) + ovflbitnum);
return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
}
/*
......@@ -67,14 +67,15 @@ _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
/* Determine the split number containing this page */
for (i = 1; i <= splitnum; i++)
{
if (ovflblkno <= (BlockNumber) (1 << i))
if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
break; /* oops */
bitnum = ovflblkno - (1 << i);
bitnum = ovflblkno - _hash_get_totalbuckets(i);
/*
* bitnum has to be greater than number of overflow page added in
* previous split point. The overflow page at this splitnum (i) if any
* should start from ((2 ^ i) + metap->hashm_spares[i - 1] + 1).
* should start from (_hash_get_totalbuckets(i) +
* metap->hashm_spares[i - 1] + 1).
*/
if (bitnum > metap->hashm_spares[i - 1] &&
bitnum <= metap->hashm_spares[i])
......
......@@ -502,14 +502,15 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
Page page;
double dnumbuckets;
uint32 num_buckets;
uint32 log2_num_buckets;
uint32 spare_index;
uint32 i;
/*
* Choose the number of initial bucket pages to match the fill factor
* given the estimated number of tuples. We round up the result to the
* next power of 2, however, and always force at least 2 bucket pages. The
* upper limit is determined by considerations explained in
* total number of buckets that must be allocated before its hashm_spares
* element is used.  However, always force at least 2 bucket pages.
* The upper limit is determined by considerations explained in
* _hash_expandtable().
*/
dnumbuckets = num_tuples / ffactor;
......@@ -518,11 +519,10 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
else if (dnumbuckets >= (double) 0x40000000)
num_buckets = 0x40000000;
else
num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
log2_num_buckets = _hash_log2(num_buckets);
Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
spare_index = _hash_spareindex(num_buckets);
Assert(spare_index < HASH_MAX_SPLITPOINTS);
page = BufferGetPage(buf);
if (initpage)
......@@ -563,18 +563,23 @@ _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
/*
* We initialize the index with N buckets, 0 .. N-1, occupying physical
* blocks 1 to N. The first freespace bitmap page is in block N+1. Since
* N is a power of 2, we can set the masks this way:
* blocks 1 to N. The first freespace bitmap page is in block N+1.
*/
metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
metap->hashm_highmask = (num_buckets << 1) - 1;
metap->hashm_maxbucket = num_buckets - 1;
/*
* Set highmask as next immediate ((2 ^ x) - 1), which should be sufficient
* to cover num_buckets.
*/
metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1;
metap->hashm_lowmask = (metap->hashm_highmask >> 1);
MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
/* Set up mapping for one spare page after the initial splitpoints */
metap->hashm_spares[log2_num_buckets] = 1;
metap->hashm_ovflpoint = log2_num_buckets;
metap->hashm_spares[spare_index] = 1;
metap->hashm_ovflpoint = spare_index;
metap->hashm_firstfree = 0;
/*
......@@ -773,25 +778,25 @@ restart_expand:
start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
/*
* If the split point is increasing (hashm_maxbucket's log base 2
* increases), we need to allocate a new batch of bucket pages.
* If the split point is increasing we need to allocate a new batch of
* bucket pages.
*/
spare_ndx = _hash_log2(new_bucket + 1);
spare_ndx = _hash_spareindex(new_bucket + 1);
if (spare_ndx > metap->hashm_ovflpoint)
{
uint32 buckets_to_add;
Assert(spare_ndx == metap->hashm_ovflpoint + 1);
/*
* The number of buckets in the new splitpoint is equal to the total
* number already in existence, i.e. new_bucket. Currently this maps
* one-to-one to blocks required, but someday we may need a more
* complicated calculation here. We treat allocation of buckets as a
* separate WAL-logged action. Even if we fail after this operation,
* won't leak bucket pages; rather, the next split will consume this
* space. In any case, even without failure we don't use all the space
* in one split operation.
*/
if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
* We treat allocation of buckets as a separate WAL-logged action.
* Even if we fail after this operation, we won't leak bucket pages;
* rather, the next split will consume this space. In any case, even
* without failure we don't use all the space in one split
* operation.
*/
buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
{
/* can't split due to BlockNumber overflow */
_hash_relbuf(rel, buf_oblkno);
......@@ -836,10 +841,9 @@ restart_expand:
}
/*
* If the split point is increasing (hashm_maxbucket's log base 2
* increases), we need to adjust the hashm_spares[] array and
* hashm_ovflpoint so that future overflow pages will be created beyond
* this new batch of bucket pages.
* If the split point is increasing we need to adjust the hashm_spares[]
* array and hashm_ovflpoint so that future overflow pages will be created
* beyond this new batch of bucket pages.
*/
if (spare_ndx > metap->hashm_ovflpoint)
{
......
......@@ -37,7 +37,15 @@ struct HSpool
{
Tuplesortstate *sortstate; /* state data for tuplesort.c */
Relation index;
uint32 hash_mask; /* bitmask for hash codes */
/*
* We sort the hash keys based on the buckets they belong to. Below masks
* are used in _hash_hashkey2bucket to determine the bucket of given hash
* key.
*/
uint32 high_mask;
uint32 low_mask;
uint32 max_buckets;
};
......@@ -56,11 +64,12 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
* num_buckets buckets in the index, the appropriate mask can be computed
* as follows.
*
* Note: at present, the passed-in num_buckets is always a power of 2, so
* we could just compute num_buckets - 1. We prefer not to assume that
* here, though.
* NOTE : This hash mask calculation should be in sync with similar
* calculation in _hash_init_metabuffer.
*/
hspool->hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1;
hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1;
hspool->low_mask = (hspool->high_mask >> 1);
hspool->max_buckets = num_buckets - 1;
/*
* We size the sort area as maintenance_work_mem rather than work_mem to
......@@ -69,7 +78,9 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
*/
hspool->sortstate = tuplesort_begin_index_hash(heap,
index,
hspool->hash_mask,
hspool->high_mask,
hspool->low_mask,
hspool->max_buckets,
maintenance_work_mem,
false);
......@@ -122,7 +133,9 @@ _h_indexbuild(HSpool *hspool, Relation heapRel)
#ifdef USE_ASSERT_CHECKING
uint32 lasthashkey = hashkey;
hashkey = _hash_get_indextuple_hashkey(itup) & hspool->hash_mask;
hashkey = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
hspool->max_buckets, hspool->high_mask,
hspool->low_mask);
Assert(hashkey >= lasthashkey);
#endif
......
......@@ -149,6 +149,71 @@ _hash_log2(uint32 num)
return i;
}
/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the
 *					   bucket
 *
 * num_bucket is a 1-based bucket count (callers pass "bucket number + 1").
 * The result is the index into hashm_spares[] of the splitpoint phase that
 * contains that bucket.
 *
 * Splitpoint groups below HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE consist of a
 * single phase, so for them the phase number equals the group number.  Every
 * later group is divided into HASH_SPLITPOINT_PHASES_PER_GRP equal phases.
 */
uint32
_hash_spareindex(uint32 num_bucket)
{
	uint32		splitpoint_group;
	uint32		splitpoint_phases;

	splitpoint_group = _hash_log2(num_bucket);

	if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return splitpoint_group;

	/* account for single-phase groups */
	splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

	/* account for multi-phase groups before splitpoint_group */
	splitpoint_phases +=
		((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
		 HASH_SPLITPOINT_PHASE_BITS);

	/*
	 * account for phases within current group
	 *
	 * Group G holds the 0-based bucket numbers 2^(G-1) .. 2^G - 1, split
	 * into phases of 2^(G - 1 - HASH_SPLITPOINT_PHASE_BITS) buckets each.
	 * The shift therefore has to depend on the group: shifting
	 * (num_bucket - 1) right by G - (HASH_SPLITPOINT_PHASE_BITS + 1) leaves
	 * the group's leading 1 bit followed by the phase number, and the mask
	 * then strips the leading bit.  A constant shift would make the phase
	 * cycle every few buckets and disagree with _hash_get_totalbuckets().
	 */
	splitpoint_phases +=
		(((num_bucket - 1) >>
		  (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
		 HASH_SPLITPOINT_PHASE_MASK);	/* to 0-based value. */

	return splitpoint_phases;
}
/*
 * _hash_get_totalbuckets -- returns total number of buckets allocated till
 *							the given splitpoint phase.
 *
 * For phases below HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE the phase number is
 * the group number and the total is simply 2^phase.  For later phases the
 * total is the bucket count before the phase's group plus one equal share of
 * the group for every phase of it up to and including the given one.
 */
uint32
_hash_get_totalbuckets(uint32 splitpoint_phase)
{
	uint32		multi_phase_ofs;
	uint32		group;
	uint32		phases_done;
	uint32		buckets_before_group;
	uint32		buckets_per_phase;

	/* single-phase groups: phase number == group number */
	if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return (1 << splitpoint_phase);

	/* position of this phase counted from the first multi-phase group */
	multi_phase_ofs = splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

	/* splitpoint group that contains the phase */
	group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE +
		(multi_phase_ofs >> HASH_SPLITPOINT_PHASE_BITS);

	/* phases of that group covered so far, from 0-based to 1-based */
	phases_done = (multi_phase_ofs & HASH_SPLITPOINT_PHASE_MASK) + 1;

	/* buckets that exist before the group starts */
	buckets_before_group = (1 << (group - 1));

	/* the group adds that many buckets again, in equal phases */
	buckets_per_phase = ((1 << (group - 1)) >> HASH_SPLITPOINT_PHASE_BITS);

	return buckets_before_group + buckets_per_phase * phases_done;
}
/*
* _hash_checkpage -- sanity checks on the format of all hash pages
*
......
......@@ -127,6 +127,7 @@
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/hash.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "commands/tablespace.h"
......@@ -473,7 +474,9 @@ struct Tuplesortstate
bool enforceUnique; /* complain if we find duplicate tuples */
/* These are specific to the index_hash subcase: */
uint32 hash_mask; /* mask for sortable part of hash code */
uint32 high_mask; /* masks for sortable part of hash code */
uint32 low_mask;
uint32 max_buckets;
/*
* These variables are specific to the Datum case; they are set by
......@@ -991,7 +994,9 @@ tuplesort_begin_index_btree(Relation heapRel,
Tuplesortstate *
tuplesort_begin_index_hash(Relation heapRel,
Relation indexRel,
uint32 hash_mask,
uint32 high_mask,
uint32 low_mask,
uint32 max_buckets,
int workMem, bool randomAccess)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
......@@ -1002,8 +1007,11 @@ tuplesort_begin_index_hash(Relation heapRel,
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c",
hash_mask,
"begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
"max_buckets = 0x%x, workMem = %d, randomAccess = %c",
high_mask,
low_mask,
max_buckets,
workMem, randomAccess ? 't' : 'f');
#endif
......@@ -1017,7 +1025,9 @@ tuplesort_begin_index_hash(Relation heapRel,
state->heapRel = heapRel;
state->indexRel = indexRel;
state->hash_mask = hash_mask;
state->high_mask = high_mask;
state->low_mask = low_mask;
state->max_buckets = max_buckets;
MemoryContextSwitchTo(oldcontext);
......@@ -4157,8 +4167,8 @@ static int
comparetup_index_hash(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state)
{
uint32 hash1;
uint32 hash2;
Bucket bucket1;
Bucket bucket2;
IndexTuple tuple1;
IndexTuple tuple2;
......@@ -4167,13 +4177,16 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
* that the first column of the index tuple is the hash key.
*/
Assert(!a->isnull1);
hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
state->max_buckets, state->high_mask,
state->low_mask);
Assert(!b->isnull1);
hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
if (hash1 > hash2)
bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
state->max_buckets, state->high_mask,
state->low_mask);
if (bucket1 > bucket2)
return 1;
else if (hash1 < hash2)
else if (bucket1 < bucket2)
return -1;
/*
......
......@@ -36,7 +36,7 @@ typedef uint32 Bucket;
#define InvalidBucket ((Bucket) 0xFFFFFFFF)
#define BUCKET_TO_BLKNO(metap,B) \
((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)
/*
* Special space for hash index pages.
......@@ -158,7 +158,8 @@ typedef HashScanOpaqueData *HashScanOpaque;
#define HASH_METAPAGE 0 /* metapage is always block 0 */
#define HASH_MAGIC 0x6440640
#define HASH_VERSION 2 /* 2 signifies only hash key value is stored */
#define HASH_VERSION 3 /* 3 signifies multi-phased bucket allocation
* to reduce doubling */
/*
* spares[] holds the number of overflow pages currently allocated at or
......@@ -176,13 +177,28 @@ typedef HashScanOpaqueData *HashScanOpaque;
*
* The limitation on the size of spares[] comes from the fact that there's
* no point in having more than 2^32 buckets with only uint32 hashcodes.
* (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
* adjusted in such a way to accommodate multi phased allocation of buckets
* after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
*
* There is no particular upper limit on the size of mapp[], other than
* needing to fit into the metapage. (With 8K block size, 128 bitmaps
* limit us to 64 GB of overflow space...)
*/
#define HASH_MAX_SPLITPOINTS 32
#define HASH_MAX_BITMAPS 128
#define HASH_SPLITPOINT_PHASE_BITS 2
#define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS)
#define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1)
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10
/* defines max number of splitpoint phases a hash index can have */
#define HASH_MAX_SPLITPOINT_GROUP 32
#define HASH_MAX_SPLITPOINTS \
(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
HASH_SPLITPOINT_PHASES_PER_GRP) + \
HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
typedef struct HashMetaPageData
{
uint32 hashm_magic; /* magic no. for hash tables */
......@@ -382,6 +398,8 @@ extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
uint32 highmask, uint32 lowmask);
extern uint32 _hash_log2(uint32 num);
extern uint32 _hash_spareindex(uint32 num_bucket);
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern bool _hash_convert_tuple(Relation index,
......
......@@ -72,7 +72,9 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
int workMem, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
Relation indexRel,
uint32 hash_mask,
uint32 high_mask,
uint32 low_mask,
uint32 max_buckets,
int workMem, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
Oid sortOperator, Oid sortCollation,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment