Commit 21a3cf41 authored by Robert Haas's avatar Robert Haas

hash: Refactor and clean up bucket split code.

As with commit 30df93f6 and commit
b0f18cb7, the goal here is to move all
of the related page modifications to a single section of code, in
preparation for adding write-ahead logging.

Amit Kapila, with slight changes by me.  The larger patch series of
which this is a part has been reviewed and tested by Álvaro Herrera,
Ashutosh Sharma, Mark Kirkwood, Jeff Janes, and Jesper Pedersen.
parent 123ccbe5
...@@ -40,12 +40,9 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf, ...@@ -40,12 +40,9 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf,
Bucket obucket, Bucket nbucket, Bucket obucket, Bucket nbucket,
Buffer obuf, Buffer obuf,
Buffer nbuf, Buffer nbuf,
HTAB *htab,
uint32 maxbucket, uint32 maxbucket,
uint32 highmask, uint32 lowmask); uint32 highmask, uint32 lowmask);
static void _hash_splitbucket_guts(Relation rel, Buffer metabuf,
Bucket obucket, Bucket nbucket, Buffer obuf,
Buffer nbuf, HTAB *htab, uint32 maxbucket,
uint32 highmask, uint32 lowmask);
/* /*
...@@ -497,7 +494,9 @@ _hash_expandtable(Relation rel, Buffer metabuf) ...@@ -497,7 +494,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
Buffer buf_nblkno; Buffer buf_nblkno;
Buffer buf_oblkno; Buffer buf_oblkno;
Page opage; Page opage;
Page npage;
HashPageOpaque oopaque; HashPageOpaque oopaque;
HashPageOpaque nopaque;
uint32 maxbucket; uint32 maxbucket;
uint32 highmask; uint32 highmask;
uint32 lowmask; uint32 lowmask;
...@@ -685,18 +684,18 @@ restart_expand: ...@@ -685,18 +684,18 @@ restart_expand:
goto fail; goto fail;
} }
/* /*
* Okay to proceed with split. Update the metapage bucket mapping info. * Since we are scribbling on the pages in the shared buffers, establish a
* * critical section. Any failure in this next code leaves us with a big
* Since we are scribbling on the metapage data right in the shared
* buffer, any failure in this next little bit leaves us with a big
* problem: the metapage is effectively corrupt but could get written back * problem: the metapage is effectively corrupt but could get written back
* to disk. We don't really expect any failure, but just to be sure, * to disk. We don't really expect any failure, but just to be sure,
* establish a critical section. * establish a critical section.
*/ */
START_CRIT_SECTION(); START_CRIT_SECTION();
/*
* Okay to proceed with split. Update the metapage bucket mapping info.
*/
metap->hashm_maxbucket = new_bucket; metap->hashm_maxbucket = new_bucket;
if (new_bucket > metap->hashm_highmask) if (new_bucket > metap->hashm_highmask)
...@@ -718,8 +717,7 @@ restart_expand: ...@@ -718,8 +717,7 @@ restart_expand:
metap->hashm_ovflpoint = spare_ndx; metap->hashm_ovflpoint = spare_ndx;
} }
/* Done mucking with metapage */ MarkBufferDirty(metabuf);
END_CRIT_SECTION();
/* /*
* Copy bucket mapping info now; this saves re-accessing the meta page * Copy bucket mapping info now; this saves re-accessing the meta page
...@@ -732,16 +730,51 @@ restart_expand: ...@@ -732,16 +730,51 @@ restart_expand:
highmask = metap->hashm_highmask; highmask = metap->hashm_highmask;
lowmask = metap->hashm_lowmask; lowmask = metap->hashm_lowmask;
/* Write out the metapage and drop lock, but keep pin */ opage = BufferGetPage(buf_oblkno);
MarkBufferDirty(metabuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
/*
* Mark the old bucket to indicate that split is in progress. (At
* operation end, we will clear the split-in-progress flag.) Also,
* for a primary bucket page, hasho_prevblkno stores the number of
* buckets that existed as of the last split, so we must update that
* value here.
*/
oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
oopaque->hasho_prevblkno = maxbucket;
MarkBufferDirty(buf_oblkno);
npage = BufferGetPage(buf_nblkno);
/*
* initialize the new bucket's primary page and mark it to indicate that
* split is in progress.
*/
nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
nopaque->hasho_prevblkno = maxbucket;
nopaque->hasho_nextblkno = InvalidBlockNumber;
nopaque->hasho_bucket = new_bucket;
nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
nopaque->hasho_page_id = HASHO_PAGE_ID;
MarkBufferDirty(buf_nblkno);
END_CRIT_SECTION();
/* drop lock, but keep pin */
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
/* Relocate records to the new bucket */ /* Relocate records to the new bucket */
_hash_splitbucket(rel, metabuf, _hash_splitbucket(rel, metabuf,
old_bucket, new_bucket, old_bucket, new_bucket,
buf_oblkno, buf_nblkno, buf_oblkno, buf_nblkno, NULL,
maxbucket, highmask, lowmask); maxbucket, highmask, lowmask);
/* all done, now release the locks and pins on primary buckets. */
_hash_relbuf(rel, buf_oblkno);
_hash_relbuf(rel, buf_nblkno);
return; return;
/* Here if decide not to split or fail to acquire old bucket lock */ /* Here if decide not to split or fail to acquire old bucket lock */
...@@ -803,10 +836,16 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) ...@@ -803,10 +836,16 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
/* /*
* _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
* *
* This routine is used to partition the tuples between old and new bucket and
* is used to finish the incomplete split operations. To finish the previously
* interrupted split operation, the caller needs to fill htab. If htab is set,
* then we skip the movement of tuples that exists in htab, otherwise NULL
* value of htab indicates movement of all the tuples that belong to the new
* bucket.
*
* We are splitting a bucket that consists of a base bucket page and zero * We are splitting a bucket that consists of a base bucket page and zero
* or more overflow (bucket chain) pages. We must relocate tuples that * or more overflow (bucket chain) pages. We must relocate tuples that
* belong in the new bucket, and compress out any free space in the old * belong in the new bucket.
* bucket.
* *
* The caller must hold cleanup locks on both buckets to ensure that * The caller must hold cleanup locks on both buckets to ensure that
* no one else is trying to access them (see README). * no one else is trying to access them (see README).
...@@ -827,68 +866,6 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) ...@@ -827,68 +866,6 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
*/ */
static void static void
_hash_splitbucket(Relation rel, _hash_splitbucket(Relation rel,
Buffer metabuf,
Bucket obucket,
Bucket nbucket,
Buffer obuf,
Buffer nbuf,
uint32 maxbucket,
uint32 highmask,
uint32 lowmask)
{
Page opage;
Page npage;
HashPageOpaque oopaque;
HashPageOpaque nopaque;
opage = BufferGetPage(obuf);
oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
/*
* Mark the old bucket to indicate that split is in progress. (At
* operation end, we will clear the split-in-progress flag.) Also,
* for a primary bucket page, hasho_prevblkno stores the number of
* buckets that existed as of the last split, so we must update that
* value here.
*/
oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
oopaque->hasho_prevblkno = maxbucket;
npage = BufferGetPage(nbuf);
/*
* initialize the new bucket's primary page and mark it to indicate that
* split is in progress.
*/
nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
nopaque->hasho_prevblkno = maxbucket;
nopaque->hasho_nextblkno = InvalidBlockNumber;
nopaque->hasho_bucket = nbucket;
nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
nopaque->hasho_page_id = HASHO_PAGE_ID;
_hash_splitbucket_guts(rel, metabuf, obucket,
nbucket, obuf, nbuf, NULL,
maxbucket, highmask, lowmask);
/* all done, now release the locks and pins on primary buckets. */
_hash_relbuf(rel, obuf);
_hash_relbuf(rel, nbuf);
}
/*
* _hash_splitbucket_guts -- Helper function to perform the split operation
*
* This routine is used to partition the tuples between old and new bucket and
* to finish incomplete split operations. To finish the previously
* interrupted split operation, caller needs to fill htab. If htab is set, then
* we skip the movement of tuples that exists in htab, otherwise NULL value of
* htab indicates movement of all the tuples that belong to new bucket.
*
* Caller needs to lock and unlock the old and new primary buckets.
*/
static void
_hash_splitbucket_guts(Relation rel,
Buffer metabuf, Buffer metabuf,
Bucket obucket, Bucket obucket,
Bucket nbucket, Bucket nbucket,
...@@ -987,6 +964,7 @@ _hash_splitbucket_guts(Relation rel, ...@@ -987,6 +964,7 @@ _hash_splitbucket_guts(Relation rel,
{ {
/* write out nbuf and drop lock, but keep pin */ /* write out nbuf and drop lock, but keep pin */
MarkBufferDirty(nbuf); MarkBufferDirty(nbuf);
/* drop lock, but keep pin */
LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
/* chain to a new overflow page */ /* chain to a new overflow page */
nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false); nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false);
...@@ -1025,7 +1003,14 @@ _hash_splitbucket_guts(Relation rel, ...@@ -1025,7 +1003,14 @@ _hash_splitbucket_guts(Relation rel,
/* Exit loop if no more overflow pages in old bucket */ /* Exit loop if no more overflow pages in old bucket */
if (!BlockNumberIsValid(oblkno)) if (!BlockNumberIsValid(oblkno))
{
MarkBufferDirty(nbuf);
if (nbuf == bucket_nbuf)
LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
else
_hash_relbuf(rel, nbuf);
break; break;
}
/* Else, advance to next old page */ /* Else, advance to next old page */
obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
...@@ -1041,17 +1026,6 @@ _hash_splitbucket_guts(Relation rel, ...@@ -1041,17 +1026,6 @@ _hash_splitbucket_guts(Relation rel,
* To avoid deadlocks due to locking order of buckets, first lock the old * To avoid deadlocks due to locking order of buckets, first lock the old
* bucket and then the new bucket. * bucket and then the new bucket.
*/ */
if (nbuf == bucket_nbuf)
{
MarkBufferDirty(bucket_nbuf);
LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
}
else
{
MarkBufferDirty(nbuf);
_hash_relbuf(rel, nbuf);
}
LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
opage = BufferGetPage(bucket_obuf); opage = BufferGetPage(bucket_obuf);
oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
...@@ -1192,7 +1166,7 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, ...@@ -1192,7 +1166,7 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
nbucket = npageopaque->hasho_bucket; nbucket = npageopaque->hasho_bucket;
_hash_splitbucket_guts(rel, metabuf, obucket, _hash_splitbucket(rel, metabuf, obucket,
nbucket, obuf, bucket_nbuf, tidhtab, nbucket, obuf, bucket_nbuf, tidhtab,
maxbucket, highmask, lowmask); maxbucket, highmask, lowmask);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment