Commit 7a369371 authored by Tom Lane's avatar Tom Lane

Reimplement hash index locking algorithms, per my recent proposal to

pghackers.  This fixes the problem recently reported by Markus Kräutner
(hash bucket split corrupts the state of scans being done concurrently),
and I believe it also fixes all the known problems with deadlocks in
hash index operations.  Hash indexes are still not really ready for prime
time (since they aren't WAL-logged), but this is a step forward.
parent ca43f71c
$Header: /cvsroot/pgsql/src/backend/access/hash/README,v 1.2 2003/09/02 03:29:01 tgl Exp $
$Header: /cvsroot/pgsql/src/backend/access/hash/README,v 1.3 2003/09/04 22:06:27 tgl Exp $
This directory contains an implementation of hash indexing for Postgres.
......@@ -229,8 +229,8 @@ existing bucket in two, thereby lowering the fill ratio:
check split still needed
if split not needed anymore, drop locks and exit
decide which bucket to split
Attempt to X-lock new bucket number (shouldn't fail, but...)
Attempt to X-lock old bucket number (definitely could fail)
Attempt to X-lock new bucket number (shouldn't fail, but...)
if above fail, drop locks and exit
update meta page to reflect new number of buckets
write/release meta page
......@@ -261,12 +261,6 @@ not be overfull and split attempts will stop. (We could make a successful
splitter loop to see if the index is still overfull, but it seems better to
distribute the split overhead across successive insertions.)
It may be wise to make the initial exclusive-lock-page-zero operation a
conditional one as well, although the odds of a deadlock failure are quite
low. (AFAICS it could only deadlock against a VACUUM operation that is
trying to X-lock a bucket that the current process has a stopped indexscan
in.)
A problem is that if a split fails partway through (e.g., due to insufficient
disk space) the index is left corrupt. The probability of that could be
made quite low if we grab a free page or two before we update the meta
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -8,22 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.30 2003/08/04 02:39:57 momjian Exp $
*
* NOTES
* Because we can be doing an index scan on a relation while we
* update it, we need to avoid missing data that moves around in
* the index. The routines and global variables in this file
* guarantee that all scans in the local address space stay
* correctly positioned. This is all we need to worry about, since
* write locking guarantees that no one else will be on the same
* page at the same time as we are.
*
* The scheme is to manage a list of active scans in the current
* backend. Whenever we add or remove records from an index, we
* check the list of active scans to see if any has been affected.
* A scan is affected only if it is on the same relation, and the
* same page, as the update.
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.31 2003/09/04 22:06:27 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -44,10 +29,6 @@ typedef HashScanListData *HashScanList;
static HashScanList HashScans = (HashScanList) NULL;
static void _hash_scandel(IndexScanDesc scan,
BlockNumber blkno, OffsetNumber offno);
/*
* AtEOXact_hash() --- clean up hash subsystem at xact abort or commit.
*
......@@ -67,9 +48,6 @@ AtEOXact_hash(void)
* at end of transaction anyway.
*/
HashScans = NULL;
/* If we were building a hash, we ain't anymore. */
BuildingHash = false;
}
/*
......@@ -112,70 +90,26 @@ _hash_dropscan(IndexScanDesc scan)
pfree(chk);
}
void
_hash_adjscans(Relation rel, ItemPointer tid)
/*
* Is there an active scan in this bucket?
*/
bool
_hash_has_active_scan(Relation rel, Bucket bucket)
{
Oid relid = RelationGetRelid(rel);
HashScanList l;
Oid relid;
relid = RelationGetRelid(rel);
for (l = HashScans; l != (HashScanList) NULL; l = l->hashsl_next)
for (l = HashScans; l != NULL; l = l->hashsl_next)
{
if (relid == l->hashsl_scan->indexRelation->rd_id)
_hash_scandel(l->hashsl_scan, ItemPointerGetBlockNumber(tid),
ItemPointerGetOffsetNumber(tid));
}
}
{
HashScanOpaque so = (HashScanOpaque) l->hashsl_scan->opaque;
static void
_hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
{
ItemPointer current;
ItemPointer mark;
Buffer buf;
Buffer metabuf;
HashScanOpaque so;
so = (HashScanOpaque) scan->opaque;
current = &(scan->currentItemData);
mark = &(scan->currentMarkData);
if (ItemPointerIsValid(current)
&& ItemPointerGetBlockNumber(current) == blkno
&& ItemPointerGetOffsetNumber(current) >= offno)
{
metabuf = _hash_getbuf(scan->indexRelation, HASH_METAPAGE, HASH_READ);
buf = so->hashso_curbuf;
_hash_step(scan, &buf, BackwardScanDirection, metabuf);
if (so->hashso_bucket_valid &&
so->hashso_bucket == bucket)
return true;
}
}
if (ItemPointerIsValid(mark)
&& ItemPointerGetBlockNumber(mark) == blkno
&& ItemPointerGetOffsetNumber(mark) >= offno)
{
/*
* The idea here is to exchange the current and mark positions,
* then step backwards (affecting current), then exchange again.
*/
ItemPointerData tmpitem;
Buffer tmpbuf;
tmpitem = *mark;
*mark = *current;
*current = tmpitem;
tmpbuf = so->hashso_mrkbuf;
so->hashso_mrkbuf = so->hashso_curbuf;
so->hashso_curbuf = tmpbuf;
metabuf = _hash_getbuf(scan->indexRelation, HASH_METAPAGE, HASH_READ);
buf = so->hashso_curbuf;
_hash_step(scan, &buf, BackwardScanDirection, metabuf);
tmpitem = *mark;
*mark = *current;
*current = tmpitem;
tmpbuf = so->hashso_mrkbuf;
so->hashso_mrkbuf = so->hashso_curbuf;
so->hashso_curbuf = tmpbuf;
}
return false;
}
This diff is collapsed.
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.35 2003/09/02 18:13:31 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.36 2003/09/04 22:06:27 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -19,46 +19,6 @@
#include "access/iqual.h"
/*
 * _hash_mkscankey -- construct a scan key array from an index tuple
 *
 * One ScanKeyData entry is palloc'd per index attribute (relnatts of rel).
 * Each entry is initialized from the matching attribute of itup together
 * with the HASHPROC support procedure for that column; an attribute that
 * is null is flagged with SK_ISNULL.  The array can be released with
 * _hash_freeskey().
 *
 * Note: this copes with multiple index columns, but very little else in
 * access/hash does ...
 */
ScanKey
_hash_mkscankey(Relation rel, IndexTuple itup)
{
	TupleDesc	tupdesc = RelationGetDescr(rel);
	int			nkeys = rel->rd_rel->relnatts;
	ScanKey		keys;
	AttrNumber	attno;

	keys = (ScanKey) palloc(nkeys * sizeof(ScanKeyData));

	/* attribute numbers count from 1, array slots from 0 */
	for (attno = 1; attno <= nkeys; attno++)
	{
		bool		attisnull;
		Datum		attval;
		FmgrInfo   *hashproc;

		attval = index_getattr(itup, attno, tupdesc, &attisnull);
		hashproc = index_getprocinfo(rel, attno, HASHPROC);

		ScanKeyEntryInitializeWithInfo(&keys[attno - 1],
									   attisnull ? SK_ISNULL : 0x0,
									   attno,
									   hashproc,
									   CurrentMemoryContext,
									   attval);
	}

	return keys;
}
/*
 * _hash_freeskey -- release a scan key array
 *
 * Hands skey straight to pfree(); nothing else is cleaned up here.
 */
void
_hash_freeskey(ScanKey skey)
{
pfree(skey);
}
/*
* _hash_checkqual -- does the index tuple satisfy the scan conditions?
*/
......@@ -102,24 +62,31 @@ _hash_formitem(IndexTuple itup)
}
/*
* _hash_call -- given a Datum, call the index's hash procedure
*
* Returns the bucket number that the hash key maps to.
* _hash_datum2hashkey -- given a Datum, call the index's hash procedure
*/
Bucket
_hash_call(Relation rel, HashMetaPage metap, Datum key)
uint32
_hash_datum2hashkey(Relation rel, Datum key)
{
FmgrInfo *procinfo;
uint32 n;
Bucket bucket;
/* XXX assumes index has only one attribute */
procinfo = index_getprocinfo(rel, 1, HASHPROC);
n = DatumGetUInt32(FunctionCall1(procinfo, key));
bucket = n & metap->hashm_highmask;
if (bucket > metap->hashm_maxbucket)
bucket = bucket & metap->hashm_lowmask;
return DatumGetUInt32(FunctionCall1(procinfo, key));
}
/*
 * _hash_hashkey2bucket -- map a hash key onto a bucket number.
 *
 * The key is first masked with highmask.  If that yields a number larger
 * than maxbucket, the masked value is reduced further with lowmask;
 * otherwise it is returned as is.
 */
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
					 uint32 highmask, uint32 lowmask)
{
	Bucket		candidate = hashkey & highmask;

	return (candidate > maxbucket) ? (candidate & lowmask) : candidate;
}
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.59 2003/08/17 22:41:12 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.60 2003/09/04 22:06:27 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -153,7 +153,7 @@ LockRelation(Relation relation, LOCKMODE lockmode)
* As above, but only lock if we can get the lock without blocking.
* Returns TRUE iff the lock was acquired.
*
* NOTE: we do not currently need conditional versions of the other
* NOTE: we do not currently need conditional versions of all the
* LockXXX routines in this file, but they could easily be added if needed.
*/
bool
......@@ -264,6 +264,26 @@ LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
elog(ERROR, "LockAcquire failed");
}
/*
 *		ConditionalLockPage
 *
 * Like LockPage, but never waits: the lock is taken only if it can be
 * had without blocking.  Returns TRUE iff the lock was acquired.
 */
bool
ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
{
	LOCKTAG		pagetag;
	bool		acquired;

	/* Start from an all-zero tag so every field not set below is cleared. */
	MemSet(&pagetag, 0, sizeof(pagetag));
	pagetag.objId.blkno = blkno;
	pagetag.dbId = relation->rd_lockInfo.lockRelId.dbId;
	pagetag.relId = relation->rd_lockInfo.lockRelId.relId;

	/*
	 * NOTE(review): the trailing "true" is presumably LockAcquire's
	 * don't-wait flag -- confirm against lock.c.
	 */
	acquired = LockAcquire(LockTableId, &pagetag, GetCurrentTransactionId(),
						   lockmode, true);

	return acquired;
}
/*
* UnlockPage
*/
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: hash.h,v 1.52 2003/09/02 18:13:32 tgl Exp $
* $Id: hash.h,v 1.53 2003/09/04 22:06:27 tgl Exp $
*
* NOTES
* modeled after Margo Seltzer's hash implementation for unix.
......@@ -70,13 +70,27 @@ typedef HashPageOpaqueData *HashPageOpaque;
#define HASHO_FILL 0x1234
/*
* ScanOpaqueData is used to remember which buffers we're currently
* examining in the scan. We keep these buffers locked and pinned and
* recorded in the opaque entry of the scan in order to avoid doing a
* ReadBuffer() for every tuple in the index.
* HashScanOpaqueData is private state for a hash index scan.
*/
typedef struct HashScanOpaqueData
{
/*
* By definition, a hash scan should be examining only one bucket.
* We record the bucket number here as soon as it is known.
* hashso_bucket is meaningful only while hashso_bucket_valid is true;
* _hash_has_active_scan() tests the flag before comparing bucket numbers.
*/
Bucket hashso_bucket;
bool hashso_bucket_valid;
/*
* If we have a share lock on the bucket, we record it here. When
* hashso_bucket_blkno is zero, we have no such lock.
*/
BlockNumber hashso_bucket_blkno;
/*
* We also want to remember which buffers we're currently examining in the
* scan. We keep these buffers pinned (but not locked) across hashgettuple
* calls, in order to avoid doing a ReadBuffer() for every tuple in the
* index.
*
* hashso_curbuf goes with the scan's current position and hashso_mrkbuf
* with its marked position.
*/
Buffer hashso_curbuf;
Buffer hashso_mrkbuf;
} HashScanOpaqueData;
......@@ -148,10 +162,18 @@ typedef struct HashItemData
typedef HashItemData *HashItem;
/*
 * HashMaxItemSize -- largest item that fits on a hash index page
 *
 * This is the page size less the fixed per-page overhead: the page
 * header, the MAXALIGN'd HashPageOpaqueData, and one ItemIdData.
 * (It's okay for a page to hold only a single item.)
 */
#define HashMaxItemSize(page) \
	(PageGetPageSize(page) - \
	 (sizeof(PageHeaderData) + \
	  MAXALIGN(sizeof(HashPageOpaqueData)) + \
	  sizeof(ItemIdData)))
/*
* Constants
*/
#define DEFAULT_FFACTOR 300
#define BYTE_TO_BIT 3 /* 2^3 bits/byte */
#define ALL_SET ((uint32) ~0)
......@@ -180,10 +202,14 @@ typedef HashItemData *HashItem;
#define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
/*
* page locking modes
* page-level and high-level locking modes (see README)
*/
#define HASH_READ 0
#define HASH_WRITE 1
#define HASH_READ BUFFER_LOCK_SHARE
#define HASH_WRITE BUFFER_LOCK_EXCLUSIVE
#define HASH_NOLOCK (-1)
#define HASH_SHARE ShareLock
#define HASH_EXCLUSIVE ExclusiveLock
/*
* Strategy number. There's only one valid strategy for hashing: equality.
......@@ -199,8 +225,6 @@ typedef HashItemData *HashItem;
#define HASHPROC 1
extern bool BuildingHash;
/* public routines */
extern Datum hashbuild(PG_FUNCTION_ARGS);
......@@ -250,36 +274,37 @@ extern void _hash_squeezebucket(Relation rel,
Bucket bucket, BlockNumber bucket_blkno);
/* hashpage.c */
extern void _hash_metapinit(Relation rel);
extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access);
extern void _hash_relbuf(Relation rel, Buffer buf, int access);
extern void _hash_relbuf(Relation rel, Buffer buf);
extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_wrtbuf(Relation rel, Buffer buf);
extern void _hash_wrtnorelbuf(Buffer buf);
extern void _hash_wrtnorelbuf(Relation rel, Buffer buf);
extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
int to_access);
extern void _hash_metapinit(Relation rel);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
/* hashscan.c */
extern void _hash_regscan(IndexScanDesc scan);
extern void _hash_dropscan(IndexScanDesc scan);
extern void _hash_adjscans(Relation rel, ItemPointer tid);
extern bool _hash_has_active_scan(Relation rel, Bucket bucket);
extern void AtEOXact_hash(void);
/* hashsearch.c */
extern void _hash_search(Relation rel, int keysz, ScanKey scankey,
Buffer *bufP, HashMetaPage metap);
extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir,
Buffer metabuf);
extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
/* hashutil.c */
extern ScanKey _hash_mkscankey(Relation rel, IndexTuple itup);
extern void _hash_freeskey(ScanKey skey);
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
extern HashItem _hash_formitem(IndexTuple itup);
extern Bucket _hash_call(Relation rel, HashMetaPage metap, Datum key);
extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
uint32 highmask, uint32 lowmask);
extern uint32 _hash_log2(uint32 num);
extern void _hash_checkpage(Relation rel, Page page, int flags);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: lmgr.h,v 1.39 2003/08/04 02:40:14 momjian Exp $
* $Id: lmgr.h,v 1.40 2003/09/04 22:06:27 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -54,8 +54,9 @@ extern void UnlockRelation(Relation relation, LOCKMODE lockmode);
extern void LockRelationForSession(LockRelId *relid, LOCKMODE lockmode);
extern void UnlockRelationForSession(LockRelId *relid, LOCKMODE lockmode);
/* Lock a page (mainly used for indices) */
/* Lock a page (mainly used for indexes) */
extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
extern void UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
/* Lock an XID (used to wait for a transaction to finish) */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment