Commit 30df93f6 authored by Robert Haas

hash: Refactor overflow page allocation.

As with commit b0f18cb7, the goal
here is to move all of the related page modifications to a single
section of code, in preparation for adding write-ahead logging.

Amit Kapila, with slight changes by me.  The larger patch series
of which this is a part has been reviewed and tested by Álvaro
Herrera, Ashutosh Sharma, Mark Kirkwood, Jeff Janes, and Jesper
Pedersen, all of whom should also have been credited in the
previous commit message.
parent b0f18cb7
......@@ -21,7 +21,6 @@
#include "utils/rel.h"
static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
static uint32 _hash_firstfreebit(uint32 map);
......@@ -113,13 +112,30 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
Page ovflpage;
HashPageOpaque pageopaque;
HashPageOpaque ovflopaque;
/* allocate and lock an empty overflow page */
ovflbuf = _hash_getovflpage(rel, metabuf);
HashMetaPage metap;
Buffer mapbuf = InvalidBuffer;
Buffer newmapbuf = InvalidBuffer;
BlockNumber blkno;
uint32 orig_firstfree;
uint32 splitnum;
uint32 *freep = NULL;
uint32 max_ovflpg;
uint32 bit;
uint32 bitmap_page_bit;
uint32 first_page;
uint32 last_bit;
uint32 last_page;
uint32 i,
j;
bool page_found = false;
/*
* Write-lock the tail page. It is okay to hold two buffer locks here
* since there cannot be anyone else contending for access to ovflbuf.
* Write-lock the tail page. Here, we need to maintain locking order such
* that, first acquire the lock on tail page of bucket, then on meta page
* to find and lock the bitmap page and if it is found, then lock on meta
* page is released, then finally acquire the lock on new overflow buffer.
* We need this locking order to avoid deadlock with backends that are
* doing inserts.
*/
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
......@@ -153,60 +169,6 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
}
/* now that we have correct backlink, initialize new overflow page */
ovflpage = BufferGetPage(ovflbuf);
ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
ovflopaque->hasho_nextblkno = InvalidBlockNumber;
ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
ovflopaque->hasho_page_id = HASHO_PAGE_ID;
MarkBufferDirty(ovflbuf);
/* logically chain overflow page to previous page */
pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
MarkBufferDirty(buf);
if (retain_pin)
{
/* pin will be retained only for the primary bucket page */
Assert(pageopaque->hasho_flag & LH_BUCKET_PAGE);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
else
_hash_relbuf(rel, buf);
return ovflbuf;
}
/*
* _hash_getovflpage()
*
* Find an available overflow page and return it. The returned buffer
* is pinned and write-locked, and has had _hash_pageinit() applied,
* but it is caller's responsibility to fill the special space.
*
* The caller must hold a pin, but no lock, on the metapage buffer.
* That buffer is left in the same state at exit.
*/
static Buffer
_hash_getovflpage(Relation rel, Buffer metabuf)
{
HashMetaPage metap;
Buffer mapbuf = 0;
Buffer newbuf;
BlockNumber blkno;
uint32 orig_firstfree;
uint32 splitnum;
uint32 *freep = NULL;
uint32 max_ovflpg;
uint32 bit;
uint32 first_page;
uint32 last_bit;
uint32 last_page;
uint32 i,
j;
/* Get exclusive lock on the meta page */
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
......@@ -255,11 +217,31 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
{
if (freep[j] != ALL_SET)
{
page_found = true;
/* Reacquire exclusive lock on the meta page */
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
/* convert bit to bit number within page */
bit += _hash_firstfreebit(freep[j]);
bitmap_page_bit = bit;
/* convert bit to absolute bit number */
bit += (i << BMPG_SHIFT(metap));
/* Calculate address of the recycled overflow page */
blkno = bitno_to_blkno(metap, bit);
/* Fetch and init the recycled page */
ovflbuf = _hash_getinitbuf(rel, blkno);
goto found;
}
}
/* No free space here, try to advance to next map page */
_hash_relbuf(rel, mapbuf);
mapbuf = InvalidBuffer;
i++;
j = 0; /* scan from start of next map page */
bit = 0;
......@@ -283,8 +265,15 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
* convenient to pre-mark them as "in use" too.
*/
bit = metap->hashm_spares[splitnum];
_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
metap->hashm_spares[splitnum]++;
/* metapage already has a write lock */
if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("out of overflow pages in hash index \"%s\"",
RelationGetRelationName(rel))));
newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
}
else
{
......@@ -295,7 +284,8 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
}
/* Calculate address of the new overflow page */
bit = metap->hashm_spares[splitnum];
bit = BufferIsValid(newmapbuf) ?
metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
blkno = bitno_to_blkno(metap, bit);
/*
......@@ -303,41 +293,48 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
* relation length stays in sync with ours. XXX It's annoying to do this
* with metapage write lock held; would be better to use a lock that
* doesn't block incoming searches.
*
* It is okay to hold two buffer locks here (one on tail page of bucket
* and other on new overflow page) since there cannot be anyone else
* contending for access to ovflbuf.
*/
newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
metap->hashm_spares[splitnum]++;
found:
/*
* Adjust hashm_firstfree to avoid redundant searches. But don't risk
* changing it if someone moved it while we were searching bitmap pages.
* Do the update.
*/
if (metap->hashm_firstfree == orig_firstfree)
metap->hashm_firstfree = bit + 1;
/* Write updated metapage and release lock, but not pin */
MarkBufferDirty(metabuf);
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
return newbuf;
found:
/* convert bit to bit number within page */
bit += _hash_firstfreebit(freep[j]);
/* mark page "in use" in the bitmap */
SETBIT(freep, bit);
MarkBufferDirty(mapbuf);
_hash_relbuf(rel, mapbuf);
if (page_found)
{
Assert(BufferIsValid(mapbuf));
/* Reacquire exclusive lock on the meta page */
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
/* mark page "in use" in the bitmap */
SETBIT(freep, bitmap_page_bit);
MarkBufferDirty(mapbuf);
}
else
{
/* update the count to indicate new overflow page is added */
metap->hashm_spares[splitnum]++;
/* convert bit to absolute bit number */
bit += (i << BMPG_SHIFT(metap));
if (BufferIsValid(newmapbuf))
{
_hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
MarkBufferDirty(newmapbuf);
/* add the new bitmap page to the metapage's list of bitmaps */
metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
metap->hashm_nmaps++;
metap->hashm_spares[splitnum]++;
MarkBufferDirty(metabuf);
}
/* Calculate address of the recycled overflow page */
blkno = bitno_to_blkno(metap, bit);
/*
* for new overflow page, we don't need to explicitly set the bit in
* bitmap page, as by default that will be set to "in use".
*/
}
/*
* Adjust hashm_firstfree to avoid redundant searches. But don't risk
......@@ -346,19 +343,39 @@ found:
if (metap->hashm_firstfree == orig_firstfree)
{
metap->hashm_firstfree = bit + 1;
/* Write updated metapage and release lock, but not pin */
MarkBufferDirty(metabuf);
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
}
/* initialize new overflow page */
ovflpage = BufferGetPage(ovflbuf);
ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
ovflopaque->hasho_nextblkno = InvalidBlockNumber;
ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
ovflopaque->hasho_page_id = HASHO_PAGE_ID;
MarkBufferDirty(ovflbuf);
/* logically chain overflow page to previous page */
pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
MarkBufferDirty(buf);
if (retain_pin)
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
else
{
/* We didn't change the metapage, so no need to write */
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
}
_hash_relbuf(rel, buf);
if (BufferIsValid(mapbuf))
_hash_relbuf(rel, mapbuf);
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
if (BufferIsValid(newmapbuf))
_hash_relbuf(rel, newmapbuf);
/* Fetch, init, and return the recycled page */
return _hash_getinitbuf(rel, blkno);
return ovflbuf;
}
/*
......@@ -615,6 +632,42 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno,
}
/*
 *	_hash_initbitmapbuffer()
 *
 *	Turn the page held in 'buf' into a bitmap page.
 *
 *	The page's special space is filled in with LH_BITMAP_PAGE and no
 *	prev/next links, the first 'bmsize' bytes of the bitmap area are set to
 *	all-ones (every bit reads as "in use"), and pd_lower is moved to the end
 *	of that area.
 *
 *	If 'initpage' is true, _hash_pageinit() is applied to the page first;
 *	if false, that step is skipped (presumably because the caller obtained
 *	an already-initialized page -- verify against callers).
 */
void
_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
{
	Page		page = BufferGetPage(buf);
	HashPageOpaque opaque;
	uint32	   *bitmap;
	char	   *bitmap_end;

	/* run the generic page initialization only when asked to */
	if (initpage)
		_hash_pageinit(page, BufferGetPageSize(buf));

	/* fill in the special space; must follow any page initialization above */
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	opaque->hasho_page_id = HASHO_PAGE_ID;
	opaque->hasho_flag = LH_BITMAP_PAGE;
	opaque->hasho_bucket = -1;
	opaque->hasho_nextblkno = InvalidBlockNumber;
	opaque->hasho_prevblkno = InvalidBlockNumber;

	/* every bit starts out as 1 */
	bitmap = HashPageGetBitmap(page);
	MemSet(bitmap, 0xFF, bmsize);

	/* pd_lower is the offset of the first byte past the bitmap data */
	bitmap_end = (char *) bitmap + bmsize;
	((PageHeader) page)->pd_lower = bitmap_end - (char *) page;
}
/*
* _hash_squeezebucket(rel, bucket)
*
......
......@@ -313,6 +313,7 @@ extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovf
Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
BlockNumber blkno, ForkNumber forkNum);
extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
extern void _hash_squeezebucket(Relation rel,
Bucket bucket, BlockNumber bucket_blkno,
Buffer bucket_buf,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment