Commit 6d46f478 authored by Robert Haas

Improve hash index bucket split behavior.

Previously, the right to split a bucket was represented by a
heavyweight lock on the page number of the primary bucket page.
Unfortunately, this meant that every scan needed to take a heavyweight
lock on that bucket also, which was bad for concurrency.  Instead, use
a cleanup lock on the primary bucket page to indicate the right to
begin a split, so that scans only need to retain a pin on that page,
which they would have to acquire anyway, and which is also much
cheaper.
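
To make the protocol change concrete, here is a minimal sketch of the splitter's side using the core buffer-manager API (illustration only: try_begin_split is an invented name, and the real splitter, _hash_expandtable, does considerably more bookkeeping):

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/*
 * A cleanup lock is an exclusive content lock that can only be taken when
 * no other backend holds a pin on the page.  A scan that merely keeps its
 * pin on the primary bucket page therefore holds off a split, with no
 * heavyweight-lock traffic on either side.
 */
static bool
try_begin_split(Relation rel, BlockNumber bucket_blkno)
{
	Buffer		buf = ReadBuffer(rel, bucket_blkno);

	if (!ConditionalLockBufferForCleanup(buf))
	{
		/* somebody's scan still pins this bucket; try again later */
		ReleaseBuffer(buf);
		return false;
	}

	/* ...flag the bucket as being split and proceed, step by step... */
	UnlockReleaseBuffer(buf);
	return true;
}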

In addition to reducing the locking cost, this also avoids locking out
scans and inserts for the entire lifetime of the split: while the new
bucket is being populated with copies of the appropriate tuples from
the old bucket, scans and inserts can happen in parallel.  There are
minor concurrency improvements for vacuum operations as well, though
the situation there is still far from ideal.

This patch also removes the unworldly assumption that a split will
never be interrupted.  With the new code, a split is done in a series
of small steps and the system can pick up where it left off if it is
interrupted prior to completion.  While this patch does not itself add
write-ahead logging for hash indexes, it is clearly a necessary first
step, since one of the things that could interrupt a split is the
removal of electrical power from the machine performing it.
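
The crash-recoverable state lives in page flag bits rather than in backend memory. A hedged sketch of how an interrupted split gets picked up (mirroring the pattern the hashinsert.c change below uses; resume_interrupted_split is an invented wrapper, while the flag macro and _hash_finish_split are added by this patch):

/* Caller holds a pin and a read lock on the old bucket's primary page. */
static void
resume_interrupted_split(Relation rel, Buffer metabuf, Buffer old_buf,
						 HashPageOpaque oldopaque, uint32 maxbucket,
						 uint32 highmask, uint32 lowmask)
{
	/*
	 * LH_BUCKET_BEING_SPLIT stays set on the old bucket until the split
	 * commits, so a split interrupted by an error -- or, once write-ahead
	 * logging exists, by a crash -- is detectable afterwards and can be
	 * completed by whichever backend stumbles on it next.
	 */
	if (H_BUCKET_BEING_SPLIT(oldopaque) && IsBufferCleanupOK(old_buf))
	{
		/* trade the content lock for no lock, keeping the pin */
		_hash_chgbufaccess(rel, old_buf, HASH_READ, HASH_NOLOCK);
		_hash_finish_split(rel, metabuf, old_buf, oldopaque->hasho_bucket,
						   maxbucket, highmask, lowmask);
	}
}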

Amit Kapila.  I wrote the original design on which this patch is
based, and did a good bit of work on the comments and README through
multiple rounds of review, but all of the code is Amit's.  Also
reviewed by Jesper Pedersen, Jeff Janes, and others.

Discussion: http://postgr.es/m/CAA4eK1LfzcZYxLoXS874Ad0+S-ZM60U9bwcyiUZx9mHZ-KCWhw@mail.gmail.com
parent 213c0f2d
src/backend/access/hash/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/access/hash
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
-OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
-	hashsearch.o hashsort.o hashutil.o hashvalidate.o
+OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o \
+	hashsort.o hashutil.o hashvalidate.o
 include $(top_srcdir)/src/backend/common.mk
(two collapsed file diffs not shown)
src/backend/access/hash/hashinsert.c
@@ -28,18 +28,22 @@
 void
 _hash_doinsert(Relation rel, IndexTuple itup)
 {
-	Buffer		buf;
+	Buffer		buf = InvalidBuffer;
+	Buffer		bucket_buf;
 	Buffer		metabuf;
 	HashMetaPage metap;
 	BlockNumber blkno;
-	BlockNumber oldblkno = InvalidBlockNumber;
-	bool		retry = false;
+	BlockNumber oldblkno;
+	bool		retry;
 	Page		page;
 	HashPageOpaque pageopaque;
 	Size		itemsz;
 	bool		do_expand;
 	uint32		hashkey;
 	Bucket		bucket;
+	uint32		maxbucket;
+	uint32		highmask;
+	uint32		lowmask;
 
 	/*
 	 * Get the hash key for the item (it's stored in the index tuple itself).
@@ -51,6 +55,7 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
 								 * need to be consistent */
 
+restart_insert:
 	/* Read the metapage */
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
 	metap = HashPageGetMeta(BufferGetPage(metabuf));
@@ -69,6 +74,9 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 						itemsz, HashMaxItemSize((Page) metap)),
 			   errhint("Values larger than a buffer page cannot be indexed.")));
 
+	oldblkno = InvalidBlockNumber;
+	retry = false;
+
 	/*
 	 * Loop until we get a lock on the correct target bucket.
 	 */
@@ -84,21 +92,32 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 
 		blkno = BUCKET_TO_BLKNO(metap, bucket);
 
+		/*
+		 * Copy bucket mapping info now; refer the comment in
+		 * _hash_expandtable where we copy this information before calling
+		 * _hash_splitbucket to see why this is okay.
+		 */
+		maxbucket = metap->hashm_maxbucket;
+		highmask = metap->hashm_highmask;
+		lowmask = metap->hashm_lowmask;
+
 		/* Release metapage lock, but keep pin. */
 		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 
 		/*
-		 * If the previous iteration of this loop locked what is still the
-		 * correct target bucket, we are done.  Otherwise, drop any old lock
-		 * and lock what now appears to be the correct bucket.
+		 * If the previous iteration of this loop locked the primary page of
+		 * what is still the correct target bucket, we are done.  Otherwise,
+		 * drop any old lock before acquiring the new one.
		 */
 		if (retry)
 		{
 			if (oldblkno == blkno)
 				break;
-			_hash_droplock(rel, oldblkno, HASH_SHARE);
+			_hash_relbuf(rel, buf);
 		}
-		_hash_getlock(rel, blkno, HASH_SHARE);
+
+		/* Fetch and lock the primary bucket page for the target bucket */
+		buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
 
 		/*
 		 * Reacquire metapage lock and check that no bucket split has taken
@@ -109,12 +128,36 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 		retry = true;
 	}
 
-	/* Fetch the primary bucket page for the bucket */
-	buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
+	/* remember the primary bucket buffer to release the pin on it at end. */
+	bucket_buf = buf;
+
 	page = BufferGetPage(buf);
 	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
 	Assert(pageopaque->hasho_bucket == bucket);
 
+	/*
+	 * If this bucket is in the process of being split, try to finish the
+	 * split before inserting, because that might create room for the
+	 * insertion to proceed without allocating an additional overflow page.
+	 * It's only interesting to finish the split if we're trying to insert
+	 * into the bucket from which we're removing tuples (the "old" bucket),
+	 * not if we're trying to insert into the bucket into which tuples are
+	 * being moved (the "new" bucket).
+	 */
+	if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
+	{
+		/* release the lock on bucket buffer, before completing the split. */
+		_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
+
+		_hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
+						   maxbucket, highmask, lowmask);
+
+		/* release the pin on old and meta buffer.  retry for insert. */
+		_hash_dropbuf(rel, buf);
+		_hash_dropbuf(rel, metabuf);
+		goto restart_insert;
+	}
+
 	/* Do the insertion */
 	while (PageGetFreeSpace(page) < itemsz)
 	{
@@ -127,9 +170,15 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 		{
 			/*
 			 * ovfl page exists; go get it.  if it doesn't have room, we'll
-			 * find out next pass through the loop test above.
+			 * find out next pass through the loop test above.  we always
+			 * release both the lock and pin if this is an overflow page, but
+			 * only the lock if this is the primary bucket page, since the pin
+			 * on the primary bucket must be retained throughout the scan.
			 */
-			_hash_relbuf(rel, buf);
+			if (buf != bucket_buf)
+				_hash_relbuf(rel, buf);
+			else
+				_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
 			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
 			page = BufferGetPage(buf);
 		}
@@ -144,7 +193,7 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
 
 			/* chain to a new overflow page */
-			buf = _hash_addovflpage(rel, metabuf, buf);
+			buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
 			page = BufferGetPage(buf);
 
 			/* should fit now, given test above */
@@ -158,11 +207,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 	/* found page with enough space, so add the item here */
 	(void) _hash_pgaddtup(rel, buf, itemsz, itup);
 
-	/* write and release the modified page */
+	/*
+	 * write and release the modified page.  if the page we modified was an
+	 * overflow page, we also need to separately drop the pin we retained on
+	 * the primary bucket page.
+	 */
 	_hash_wrtbuf(rel, buf);
-
-	/* We can drop the bucket lock now */
-	_hash_droplock(rel, blkno, HASH_SHARE);
+	if (buf != bucket_buf)
+		_hash_dropbuf(rel, bucket_buf);
 
 	/*
 	 * Write-lock the metapage so we can increment the tuple count.  After
(two collapsed file diffs not shown)

deleted file: src/backend/access/hash/hashscan.c
/*-------------------------------------------------------------------------
*
* hashscan.c
* manage scans on hash tables
*
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/hash/hashscan.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/hash.h"
#include "access/relscan.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/resowner.h"
/*
* We track all of a backend's active scans on hash indexes using a list
* of HashScanListData structs, which are allocated in TopMemoryContext.
* It's okay to use a long-lived context because we rely on the ResourceOwner
* mechanism to clean up unused entries after transaction or subtransaction
* abort. We can't safely keep the entries in the executor's per-query
* context, because that might be already freed before we get a chance to
* clean up the list. (XXX seems like there should be a better way to
* manage this...)
*/
typedef struct HashScanListData
{
IndexScanDesc hashsl_scan;
ResourceOwner hashsl_owner;
struct HashScanListData *hashsl_next;
} HashScanListData;
typedef HashScanListData *HashScanList;
static HashScanList HashScans = NULL;
/*
* ReleaseResources_hash() --- clean up hash subsystem resources.
*
* This is here because it needs to touch this module's static var HashScans.
*/
void
ReleaseResources_hash(void)
{
HashScanList l;
HashScanList prev;
HashScanList next;
/*
* Release all HashScanList items belonging to the current ResourceOwner.
* Note that we do not release the underlying IndexScanDesc; that's in
* executor memory and will go away on its own (in fact quite possibly has
* gone away already, so we mustn't try to touch it here).
*
* Note: this should be a no-op during normal query shutdown. However, in
* an abort situation ExecutorEnd is not called and so there may be open
* index scans to clean up.
*/
prev = NULL;
for (l = HashScans; l != NULL; l = next)
{
next = l->hashsl_next;
if (l->hashsl_owner == CurrentResourceOwner)
{
if (prev == NULL)
HashScans = next;
else
prev->hashsl_next = next;
pfree(l);
/* prev does not change */
}
else
prev = l;
}
}
/*
* _hash_regscan() -- register a new scan.
*/
void
_hash_regscan(IndexScanDesc scan)
{
HashScanList new_el;
new_el = (HashScanList) MemoryContextAlloc(TopMemoryContext,
sizeof(HashScanListData));
new_el->hashsl_scan = scan;
new_el->hashsl_owner = CurrentResourceOwner;
new_el->hashsl_next = HashScans;
HashScans = new_el;
}
/*
* _hash_dropscan() -- drop a scan from the scan list
*/
void
_hash_dropscan(IndexScanDesc scan)
{
HashScanList chk,
last;
last = NULL;
for (chk = HashScans;
chk != NULL && chk->hashsl_scan != scan;
chk = chk->hashsl_next)
last = chk;
if (chk == NULL)
elog(ERROR, "hash scan list trashed; cannot find 0x%p", (void *) scan);
if (last == NULL)
HashScans = chk->hashsl_next;
else
last->hashsl_next = chk->hashsl_next;
pfree(chk);
}
/*
* Is there an active scan in this bucket?
*/
bool
_hash_has_active_scan(Relation rel, Bucket bucket)
{
Oid relid = RelationGetRelid(rel);
HashScanList l;
for (l = HashScans; l != NULL; l = l->hashsl_next)
{
if (relid == l->hashsl_scan->indexRelation->rd_id)
{
HashScanOpaque so = (HashScanOpaque) l->hashsl_scan->opaque;
if (so->hashso_bucket_valid &&
so->hashso_bucket == bucket)
return true;
}
}
return false;
}
(collapsed file diff not shown)

src/backend/access/hash/hashutil.c
@@ -20,6 +20,8 @@
#include "utils/lsyscache.h" #include "utils/lsyscache.h"
#include "utils/rel.h" #include "utils/rel.h"
#define CALC_NEW_BUCKET(old_bucket, lowmask) \
old_bucket | (lowmask + 1)
/* /*
* _hash_checkqual -- does the index tuple satisfy the scan conditions? * _hash_checkqual -- does the index tuple satisfy the scan conditions?
@@ -352,3 +354,95 @@ _hash_binsearch_last(Page page, uint32 hash_value)
 	return lower;
 }
+
+/*
+ * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
+ *			from which current (new) bucket is being split.
+ */
+BlockNumber
+_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
+{
+	Bucket		old_bucket;
+	uint32		mask;
+	Buffer		metabuf;
+	HashMetaPage metap;
+	BlockNumber blkno;
+
+	/*
+	 * To get the old bucket from the current bucket, we need a mask to modulo
+	 * into lower half of table.  This mask is stored in meta page as
+	 * hashm_lowmask, but here we can't rely on the same, because we need a
+	 * value of lowmask that was prevalent at the time when bucket split was
+	 * started.  Masking the most significant bit of new bucket would give us
+	 * old bucket.
+	 */
+	mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
+	old_bucket = new_bucket & mask;
+
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+	blkno = BUCKET_TO_BLKNO(metap, old_bucket);
+
+	_hash_relbuf(rel, metabuf);
+
+	return blkno;
+}
+
+/*
+ * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
+ *			that will be generated after split from old bucket.
+ *
+ * This is used to find the new bucket from old bucket based on current table
+ * half.  It is mainly required to finish the incomplete splits where we are
+ * sure that not more than one bucket could have split in progress from old
+ * bucket.
+ */
+BlockNumber
+_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
+{
+	Bucket		new_bucket;
+	Buffer		metabuf;
+	HashMetaPage metap;
+	BlockNumber blkno;
+
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+	metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
+													metap->hashm_lowmask,
+													metap->hashm_maxbucket);
+	blkno = BUCKET_TO_BLKNO(metap, new_bucket);
+
+	_hash_relbuf(rel, metabuf);
+
+	return blkno;
+}
+
+/*
+ * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
+ *			generated after split from current (old) bucket.
+ *
+ * This is used to find the new bucket from old bucket.  New bucket can be
+ * obtained by OR'ing old bucket with most significant bit of current table
+ * half (lowmask passed in this function can be used to identify msb of
+ * current table half).  There could be multiple buckets that could have
+ * been split from current bucket.  We need the first such bucket that exists.
+ * Caller must ensure that no more than one split has happened from old
+ * bucket.
+ */
+Bucket
+_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
+								   uint32 lowmask, uint32 maxbucket)
+{
+	Bucket		new_bucket;
+
+	new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
+	if (new_bucket > maxbucket)
+	{
+		lowmask = lowmask >> 1;
+		new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
+	}
+
+	return new_bucket;
+}
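
As a sanity check on the bucket arithmetic above, here is a self-contained worked example (editorial illustration, not part of the patch; msb_pos stands in for the fls() the patch uses, and the macro is reparenthesized for standalone use):

#include <stdio.h>

/* Same arithmetic as CALC_NEW_BUCKET above, parenthesized for safety. */
#define CALC_NEW_BUCKET(old_bucket, lowmask) \
	((old_bucket) | ((lowmask) + 1))

/* 1-based position of the most significant set bit, like fls(). */
static int
msb_pos(unsigned int x)
{
	int			pos = 0;

	while (x != 0)
	{
		pos++;
		x >>= 1;
	}
	return pos;
}

int
main(void)
{
	/* A table with buckets 0..10: maxbucket = 10, current lowmask = 7. */
	unsigned int maxbucket = 10;
	unsigned int lowmask = 7;
	unsigned int nb;

	/* Bucket 2's target in the current half: 2 | 8 = 10, which exists. */
	printf("new bucket for 2: %u\n", CALC_NEW_BUCKET(2, lowmask));

	/*
	 * Bucket 3's target 3 | 8 = 11 exceeds maxbucket, so the fallback in
	 * _hash_get_newbucket_from_oldbucket halves lowmask, giving 3 | 4 = 7:
	 * the most recent split target of bucket 3 that actually exists.
	 */
	nb = CALC_NEW_BUCKET(3, lowmask);
	if (nb > maxbucket)
		nb = CALC_NEW_BUCKET(3, lowmask >> 1);
	printf("new bucket for 3: %u\n", nb);

	/*
	 * Reverse direction, as in _hash_get_oldblock_from_newbucket(): mask
	 * off the most significant bit to recover the old bucket, e.g.
	 * 10 = 1010b -> mask 0111b -> old bucket 2.
	 */
	printf("old bucket for 10: %u\n",
		   10 & ((1U << (msb_pos(10) - 1)) - 1));
	return 0;
}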
src/backend/utils/resowner/resowner.c
@@ -668,9 +668,6 @@ ResourceOwnerReleaseInternal(ResourceOwner owner,
 				PrintFileLeakWarning(res);
 			FileClose(res);
 		}
-
-		/* Clean up index scans too */
-		ReleaseResources_hash();
 	}
 
 	/* Let add-on modules get a chance too */
src/include/access/hash.h
@@ -24,6 +24,7 @@
 #include "lib/stringinfo.h"
 #include "storage/bufmgr.h"
 #include "storage/lockdefs.h"
+#include "utils/hsearch.h"
 #include "utils/relcache.h"
 
 /*
@@ -32,6 +33,8 @@
  */
 typedef uint32 Bucket;
 
+#define InvalidBucket	((Bucket) 0xFFFFFFFF)
+
 #define BUCKET_TO_BLKNO(metap,B) \
 	((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
 
@@ -51,6 +54,9 @@ typedef uint32 Bucket;
 #define LH_BUCKET_PAGE			(1 << 1)
 #define LH_BITMAP_PAGE			(1 << 2)
 #define LH_META_PAGE			(1 << 3)
+#define LH_BUCKET_BEING_POPULATED	(1 << 4)
+#define LH_BUCKET_BEING_SPLIT	(1 << 5)
+#define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
 
 typedef struct HashPageOpaqueData
 {
@@ -63,6 +69,10 @@ typedef struct HashPageOpaqueData
 
 typedef HashPageOpaqueData *HashPageOpaque;
 
+#define H_NEEDS_SPLIT_CLEANUP(opaque)	((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP)
+#define H_BUCKET_BEING_SPLIT(opaque)	((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT)
+#define H_BUCKET_BEING_POPULATED(opaque)	((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED)
+
 /*
  * The page ID is for the convenience of pg_filedump and similar utilities,
  * which otherwise would have a hard time telling pages of different index
@@ -79,19 +89,6 @@ typedef struct HashScanOpaqueData
 	/* Hash value of the scan key, ie, the hash key we seek */
 	uint32		hashso_sk_hash;
 
-	/*
-	 * By definition, a hash scan should be examining only one bucket. We
-	 * record the bucket number here as soon as it is known.
-	 */
-	Bucket		hashso_bucket;
-	bool		hashso_bucket_valid;
-
-	/*
-	 * If we have a share lock on the bucket, we record it here.  When
-	 * hashso_bucket_blkno is zero, we have no such lock.
-	 */
-	BlockNumber hashso_bucket_blkno;
-
 	/*
 	 * We also want to remember which buffer we're currently examining in the
 	 * scan. We keep the buffer pinned (but not locked) across hashgettuple
@@ -100,11 +97,30 @@ typedef struct HashScanOpaqueData
 	 */
 	Buffer		hashso_curbuf;
 
+	/* remember the buffer associated with primary bucket */
+	Buffer		hashso_bucket_buf;
+
+	/*
+	 * remember the buffer associated with primary bucket page of bucket being
+	 * split.  it is required during the scan of the bucket which is being
+	 * populated during split operation.
+	 */
+	Buffer		hashso_split_bucket_buf;
+
 	/* Current position of the scan, as an index TID */
 	ItemPointerData hashso_curpos;
 
 	/* Current position of the scan, as a heap TID */
 	ItemPointerData hashso_heappos;
+
+	/* Whether scan starts on bucket being populated due to split */
+	bool		hashso_buc_populated;
+
+	/*
+	 * Whether scanning bucket being split?  The value of this parameter is
+	 * referred only when hashso_buc_populated is true.
+	 */
+	bool		hashso_buc_split;
 } HashScanOpaqueData;
 
 typedef HashScanOpaqueData *HashScanOpaque;
@@ -175,6 +191,8 @@ typedef HashMetaPageData *HashMetaPage;
 			sizeof(ItemIdData) - \
 			MAXALIGN(sizeof(HashPageOpaqueData)))
 
+#define INDEX_MOVED_BY_SPLIT_MASK	0x2000
+
 #define HASH_MIN_FILLFACTOR			10
 #define HASH_DEFAULT_FILLFACTOR		75
 
@@ -223,9 +241,6 @@ typedef HashMetaPageData *HashMetaPage;
 #define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE
 #define HASH_NOLOCK		(-1)
 
-#define HASH_SHARE		ShareLock
-#define HASH_EXCLUSIVE	ExclusiveLock
-
 /*
 * Strategy number. There's only one valid strategy for hashing: equality.
 */
@@ -297,21 +312,21 @@ extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
 			   Size itemsize, IndexTuple itup);
 
 /* hashovfl.c */
-extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
-extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf,
-				   BufferAccessStrategy bstrategy);
+extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
+extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf,
+				   bool wbuf_dirty, BufferAccessStrategy bstrategy);
 extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
 				 BlockNumber blkno, ForkNumber forkNum);
 extern void _hash_squeezebucket(Relation rel,
 					Bucket bucket, BlockNumber bucket_blkno,
+					Buffer bucket_buf,
 					BufferAccessStrategy bstrategy);
 
 /* hashpage.c */
-extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
-extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
-extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
 extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
 			  int access, int flags);
+extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
+								   BlockNumber blkno, int flags);
 extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
 extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
 				ForkNumber forkNum);
@@ -320,6 +335,7 @@ extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
 						   BufferAccessStrategy bstrategy);
 extern void _hash_relbuf(Relation rel, Buffer buf);
 extern void _hash_dropbuf(Relation rel, Buffer buf);
+extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
 extern void _hash_wrtbuf(Relation rel, Buffer buf);
 extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
 				   int to_access);
@@ -327,12 +343,9 @@ extern uint32 _hash_metapinit(Relation rel, double num_tuples,
 			   ForkNumber forkNum);
 extern void _hash_pageinit(Page page, Size size);
 extern void _hash_expandtable(Relation rel, Buffer metabuf);
-
-/* hashscan.c */
-extern void _hash_regscan(IndexScanDesc scan);
-extern void _hash_dropscan(IndexScanDesc scan);
-extern bool _hash_has_active_scan(Relation rel, Bucket bucket);
-extern void ReleaseResources_hash(void);
+extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
+				   Bucket obucket, uint32 maxbucket, uint32 highmask,
+				   uint32 lowmask);
 
 /* hashsearch.c */
 extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
@@ -362,5 +375,18 @@ extern bool _hash_convert_tuple(Relation index,
 					Datum *index_values, bool *index_isnull);
 extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
 extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
+extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
+extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
+extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
+									uint32 lowmask, uint32 maxbucket);
+
+/* hash.c */
+extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
+				  Buffer bucket_buf, BlockNumber bucket_blkno,
+				  BufferAccessStrategy bstrategy,
+				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
+				  double *tuples_removed, double *num_index_tuples,
+				  bool bucket_has_garbage,
+				  IndexBulkDeleteCallback callback, void *callback_state);
 
 #endif   /* HASH_H */
src/include/access/itup.h
@@ -63,7 +63,7 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap;
  * t_info manipulation macros
  */
 #define INDEX_SIZE_MASK 0x1FFF
-/* bit 0x2000 is not used at present */
+/* bit 0x2000 is reserved for index-AM specific usage */
 #define INDEX_VAR_MASK	0x4000
 #define INDEX_NULL_MASK 0x8000
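
Since t_info packs the tuple size together with flag bits, the hash AM can use the newly reserved bit like this (a hedged sketch: the two inline helpers are invented for illustration; INDEX_MOVED_BY_SPLIT_MASK is the define this commit adds to hash.h above):

#include "postgres.h"
#include "access/itup.h"

#define INDEX_MOVED_BY_SPLIT_MASK	0x2000	/* as added to hash.h above */

/* Mark a tuple that was copied to the new bucket by a split (sketch). */
static inline void
mark_moved_by_split(IndexTuple itup)
{
	itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
}

/* Test the marker; the size bits under INDEX_SIZE_MASK are unaffected. */
static inline bool
is_moved_by_split(IndexTuple itup)
{
	return (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK) != 0;
}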