Commit 95a03e9c authored by Tom Lane's avatar Tom Lane

Another round of code cleanup on bufmgr. Use BM_VALID flag to keep track

of whether we have successfully read data into a buffer; this makes the
error behavior a bit more transparent (IMHO anyway), and also makes it
work correctly for local buffers which don't use Start/TerminateBufferIO.
Collapse three separate functions for writing a shared buffer into one.
This overlaps a bit with cleanups that Neil proposed a while back, but
which he seems not to have committed yet.
parent aeee8565
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.63 2004/04/19 23:27:17 tgl Exp $ * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.64 2004/04/21 18:06:30 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -133,11 +133,11 @@ InitBufferPool(void) ...@@ -133,11 +133,11 @@ InitBufferPool(void)
buf->bufNext = i + 1; buf->bufNext = i + 1;
CLEAR_BUFFERTAG(&(buf->tag)); CLEAR_BUFFERTAG(buf->tag);
buf->buf_id = i; buf->buf_id = i;
buf->data = MAKE_OFFSET(block); buf->data = MAKE_OFFSET(block);
buf->flags = (BM_DELETED | BM_VALID); buf->flags = 0;
buf->refcount = 0; buf->refcount = 0;
buf->io_in_progress_lock = LWLockAssign(); buf->io_in_progress_lock = LWLockAssign();
buf->cntx_lock = LWLockAssign(); buf->cntx_lock = LWLockAssign();
......
...@@ -8,20 +8,14 @@ ...@@ -8,20 +8,14 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.161 2004/04/19 23:27:17 tgl Exp $ * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.162 2004/04/21 18:06:30 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
/* /*
* * ReadBuffer() -- find or create a buffer holding the requested page,
* BufferAlloc() -- lookup a buffer in the buffer table. If * and pin it so that no one can destroy it while this process
* it isn't there add it, but do not read data into memory. * is using it.
* This is used when we are about to reinitialize the
* buffer so don't care what the current disk contents are.
* BufferAlloc() also pins the new buffer in memory.
*
* ReadBuffer() -- like BufferAlloc() but reads the data
* on a buffer cache miss.
* *
* ReleaseBuffer() -- unpin the buffer * ReleaseBuffer() -- unpin the buffer
* *
...@@ -31,7 +25,7 @@ ...@@ -31,7 +25,7 @@
* *
* WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer() * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
* *
* BufferSync() -- flush all dirty buffers in the buffer pool. * BufferSync() -- flush all (or some) dirty buffers in the buffer pool.
* *
* InitBufferPool() -- Init the buffer module. * InitBufferPool() -- Init the buffer module.
* *
...@@ -76,25 +70,19 @@ long NDirectFileRead; /* some I/O's are direct file access. ...@@ -76,25 +70,19 @@ long NDirectFileRead; /* some I/O's are direct file access.
* bypass bufmgr */ * bypass bufmgr */
long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */
/*
* Macro : BUFFER_IS_BROKEN
* Note that write error doesn't mean the buffer broken
*/
#define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY))
static void PinBuffer(BufferDesc *buf); static void PinBuffer(BufferDesc *buf);
static void UnpinBuffer(BufferDesc *buf); static void UnpinBuffer(BufferDesc *buf);
static void WaitIO(BufferDesc *buf); static void WaitIO(BufferDesc *buf);
static void StartBufferIO(BufferDesc *buf, bool forInput); static void StartBufferIO(BufferDesc *buf, bool forInput);
static void TerminateBufferIO(BufferDesc *buf); static void TerminateBufferIO(BufferDesc *buf, int err_flag);
static void ContinueBufferIO(BufferDesc *buf, bool forInput); static void ContinueBufferIO(BufferDesc *buf, bool forInput);
static void buffer_write_error_callback(void *arg); static void buffer_write_error_callback(void *arg);
static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum, static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
bool bufferLockHeld); bool bufferLockHeld);
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr); bool *foundPtr);
static void BufferReplace(BufferDesc *bufHdr); static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
static void write_buffer(Buffer buffer, bool unpin); static void write_buffer(Buffer buffer, bool unpin);
...@@ -107,8 +95,8 @@ static void write_buffer(Buffer buffer, bool unpin); ...@@ -107,8 +95,8 @@ static void write_buffer(Buffer buffer, bool unpin);
* relation at the same time!) * relation at the same time!)
* *
* Returns: the buffer number for the buffer containing * Returns: the buffer number for the buffer containing
* the block read, or NULL on an error. If successful, * the block read. The returned buffer has been pinned.
* the returned buffer has been pinned. * Does not return on error --- elog's instead.
* *
* Assume when this function is called, that reln has been * Assume when this function is called, that reln has been
* opened already. * opened already.
...@@ -149,16 +137,11 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -149,16 +137,11 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
pgstat_count_buffer_read(&reln->pgstat_info, reln); pgstat_count_buffer_read(&reln->pgstat_info, reln);
/* Substitute proper block number if caller asked for P_NEW */ /* Substitute proper block number if caller asked for P_NEW */
if (isExtend) if (isExtend)
{
blockNum = reln->rd_nblocks; blockNum = reln->rd_nblocks;
reln->rd_nblocks++;
}
bufHdr = LocalBufferAlloc(reln, blockNum, &found); bufHdr = LocalBufferAlloc(reln, blockNum, &found);
if (found) if (found)
{
LocalBufferHitCount++; LocalBufferHitCount++;
pgstat_count_buffer_hit(&reln->pgstat_info, reln);
}
} }
else else
{ {
...@@ -169,7 +152,6 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -169,7 +152,6 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
{ {
/* must be sure we have accurate file length! */ /* must be sure we have accurate file length! */
blockNum = reln->rd_nblocks = smgrnblocks(reln->rd_smgr); blockNum = reln->rd_nblocks = smgrnblocks(reln->rd_smgr);
reln->rd_nblocks++;
} }
/* /*
...@@ -180,48 +162,44 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -180,48 +162,44 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
bufHdr = BufferAlloc(reln, blockNum, &found); bufHdr = BufferAlloc(reln, blockNum, &found);
if (found) if (found)
{
BufferHitCount++; BufferHitCount++;
pgstat_count_buffer_hit(&reln->pgstat_info, reln);
}
} }
/* At this point we do NOT hold the bufmgr lock. */ /* At this point we do NOT hold the bufmgr lock. */
if (!bufHdr) /* if it was already in the buffer pool, we're done */
return InvalidBuffer;
/* if it's already in the buffer pool, we're done */
if (found) if (found)
{ {
/* That is, we're done if we expected to be able to find it ... */ /* Just need to update stats before we exit */
if (!isExtend) pgstat_count_buffer_hit(&reln->pgstat_info, reln);
return BufferDescriptorGetBuffer(bufHdr);
/* if (VacuumCostActive)
* If we found a buffer when we were expecting to extend the VacuumCostBalance += VacuumCostPageHit;
* relation, the implication is that a buffer was already created
* for the next page position, but then smgrextend failed to write return BufferDescriptorGetBuffer(bufHdr);
* the page. We'd better try the smgrextend again. But since
* BufferAlloc won't have done StartBufferIO, we must do that
* first.
*/
if (!isLocalBuf)
{
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
StartBufferIO(bufHdr, false);
LWLockRelease(BufMgrLock);
}
} }
/* /*
* if we have gotten to this point, the relation must be open in the smgr. * if we have gotten to this point, we have allocated a buffer for the
* page but its contents are not yet valid. IO_IN_PROGRESS is set for
* it, if it's a shared buffer.
*
* Note: if smgrextend fails, we will end up with a buffer that is
* allocated but not marked BM_VALID. P_NEW will still select the same
* block number (because the relation didn't get any longer on disk)
* and so future attempts to extend the relation will find the same
* buffer (if it's not been recycled) but come right back here to try
* smgrextend again.
*/ */
Assert(!(bufHdr->flags & BM_VALID));
if (isExtend) if (isExtend)
{ {
/* new buffers are zero-filled */ /* new buffers are zero-filled */
MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
smgrextend(reln->rd_smgr, blockNum, (char *) MAKE_PTR(bufHdr->data)); smgrextend(reln->rd_smgr, blockNum, (char *) MAKE_PTR(bufHdr->data));
/* After successful extend, increment relation length */
reln->rd_nblocks++;
} }
else else
{ {
...@@ -253,29 +231,41 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -253,29 +231,41 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
if (isLocalBuf) if (isLocalBuf)
{ {
/* No shared buffer state to update... */ /* Only need to adjust flags */
return BufferDescriptorGetBuffer(bufHdr); bufHdr->flags |= BM_VALID;
} }
else
{
/* lock buffer manager again to update IO IN PROGRESS */
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
/* lock buffer manager again to update IO IN PROGRESS */ /* IO Succeeded, so mark data valid */
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); bufHdr->flags |= BM_VALID;
/* IO Succeeded. clear the flags, finish buffer update */ /* If anyone was waiting for IO to complete, wake them up now */
bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS); TerminateBufferIO(bufHdr, 0);
/* If anyone was waiting for IO to complete, wake them up now */ LWLockRelease(BufMgrLock);
TerminateBufferIO(bufHdr); }
LWLockRelease(BufMgrLock); if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageMiss;
return BufferDescriptorGetBuffer(bufHdr); return BufferDescriptorGetBuffer(bufHdr);
} }
/* /*
* BufferAlloc -- Get a buffer from the buffer pool but don't * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
* read it. If successful, the returned buffer is pinned. * buffer. If no buffer exists already, selects a replacement
* victim and evicts the old page, but does NOT read in new page.
* *
* Returns: descriptor for buffer * The returned buffer is pinned and is already marked as holding the
* desired page. If it already did have the desired page, *foundPtr is
* set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked
* as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
*
* *foundPtr is actually redundant with the buffer's BM_VALID flag, but
* we keep it for simplicity in ReadBuffer.
* *
* BufMgrLock must be held at entry. When this routine returns, * BufMgrLock must be held at entry. When this routine returns,
* the BufMgrLock is guaranteed NOT to be held. * the BufMgrLock is guaranteed NOT to be held.
...@@ -285,67 +275,51 @@ BufferAlloc(Relation reln, ...@@ -285,67 +275,51 @@ BufferAlloc(Relation reln,
BlockNumber blockNum, BlockNumber blockNum,
bool *foundPtr) bool *foundPtr)
{ {
BufferTag newTag; /* identity of requested block */
BufferDesc *buf, BufferDesc *buf,
*buf2; *buf2;
BufferTag newTag; /* identity of requested block */
int cdb_found_index, int cdb_found_index,
cdb_replace_index; cdb_replace_index;
bool inProgress; /* buffer undergoing IO */ bool inProgress; /* did we already do StartBufferIO? */
/* create a tag so we can lookup the buffer */ /* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(&newTag, reln, blockNum); INIT_BUFFERTAG(newTag, reln, blockNum);
/* see if the block is in the buffer pool already */ /* see if the block is in the buffer pool already */
buf = StrategyBufferLookup(&newTag, false, &cdb_found_index); buf = StrategyBufferLookup(&newTag, false, &cdb_found_index);
if (buf != NULL) if (buf != NULL)
{ {
/* /*
* Found it. Now, (a) pin the buffer so no one steals it from the * Found it. Now, pin the buffer so no one can steal it from the
* buffer pool, (b) check IO_IN_PROGRESS, someone may be faulting * buffer pool, and check to see if someone else is still reading
* the buffer into the buffer pool. * data into the buffer. (Formerly, we'd always block here if
* IO_IN_PROGRESS is set, but there's no need to wait when someone
* is writing rather than reading.)
*/ */
*foundPtr = TRUE;
PinBuffer(buf); PinBuffer(buf);
inProgress = (buf->flags & BM_IO_IN_PROGRESS);
*foundPtr = TRUE; if (!(buf->flags & BM_VALID))
if (inProgress) /* confirm end of IO */
{
WaitIO(buf);
inProgress = (buf->flags & BM_IO_IN_PROGRESS);
}
if (BUFFER_IS_BROKEN(buf))
{ {
/* if (buf->flags & BM_IO_IN_PROGRESS)
* I couldn't understand the following old comment. If there's {
* no IO for the buffer and the buffer is BROKEN, it should be /* someone else is reading it, wait for them */
* read again. So start a new buffer IO here. WaitIO(buf);
* }
* wierd race condition: if (!(buf->flags & BM_VALID))
* {
* We were waiting for someone else to read the buffer. While we /*
* were waiting, the reader boof'd in some way, so the * If we get here, previous attempts to read the buffer
* contents of the buffer are still invalid. By saying that * must have failed ... but we shall bravely try again.
* we didn't find it, we can make the caller reinitialize the */
* buffer. If two processes are waiting for this block, both *foundPtr = FALSE;
* will read the block. The second one to finish may StartBufferIO(buf, true);
* overwrite any updates made by the first. (Assume higher }
* level synchronization prevents this from happening).
*
* This is never going to happen, don't worry about it.
*/
*foundPtr = FALSE;
StartBufferIO(buf, true);
} }
LWLockRelease(BufMgrLock); LWLockRelease(BufMgrLock);
/*
* Do the cost accounting for vacuum
*/
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageHit;
return buf; return buf;
} }
...@@ -354,10 +328,10 @@ BufferAlloc(Relation reln, ...@@ -354,10 +328,10 @@ BufferAlloc(Relation reln,
/* /*
* Didn't find it in the buffer pool. We'll have to initialize a new * Didn't find it in the buffer pool. We'll have to initialize a new
* buffer. First, grab one from the free list. If it's dirty, flush * buffer. First, grab one from the free list. If it's dirty, flush
* it to disk. Remember to unlock BufMgrLock while doing the IOs. * it to disk. Remember to unlock BufMgrLock while doing the IO.
*/ */
inProgress = FALSE; inProgress = FALSE;
for (buf = NULL; buf == NULL;) do
{ {
buf = StrategyGetBuffer(&cdb_replace_index); buf = StrategyGetBuffer(&cdb_replace_index);
...@@ -374,130 +348,100 @@ BufferAlloc(Relation reln, ...@@ -374,130 +348,100 @@ BufferAlloc(Relation reln,
buf->refcount = 1; buf->refcount = 1;
PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1; PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1;
if (buf->flags & BM_DIRTY || buf->cntxDirty) if ((buf->flags & BM_VALID) &&
(buf->flags & BM_DIRTY || buf->cntxDirty))
{ {
/* /*
* skip write error buffers * Set BM_IO_IN_PROGRESS to show the buffer is being written.
* It cannot already be set because the buffer would be pinned
* if someone were writing it.
*
* Note: it's okay to grab the io_in_progress lock while holding
* BufMgrLock. All code paths that acquire this lock pin the
* buffer first; since no one had it pinned (it just came off the
* free list), no one else can have the lock.
*/ */
if ((buf->flags & BM_IO_ERROR) != 0) StartBufferIO(buf, false);
{
UnpinBuffer(buf);
buf = NULL;
continue;
}
/*
* Set BM_IO_IN_PROGRESS to keep anyone from doing anything
* with the contents of the buffer while we write it out. We
* don't really care if they try to read it, but if they can
* complete a BufferAlloc on it they can then scribble into
* it, and we'd really like to avoid that while we are
* flushing the buffer. Setting this flag should block them
* in WaitIO until we're done.
*/
inProgress = TRUE; inProgress = TRUE;
/*
* All code paths that acquire this lock pin the buffer first;
* since no one had it pinned (it just came off the free
* list), no one else can have this lock.
*/
StartBufferIO(buf, false);
/* /*
* Write the buffer out, being careful to release BufMgrLock * Write the buffer out, being careful to release BufMgrLock
* before starting the I/O. * while doing the I/O.
*/
BufferReplace(buf);
/*
* BM_JUST_DIRTIED cleared by BufferReplace and shouldn't
* be set by anyone. - vadim 01/17/97
*/
if (buf->flags & BM_JUST_DIRTIED)
{
elog(PANIC, "content of block %u of %u/%u changed while flushing",
buf->tag.blockNum,
buf->tag.rnode.tblNode, buf->tag.rnode.relNode);
}
buf->flags &= ~BM_DIRTY;
buf->cntxDirty = false;
/*
* Somebody could have pinned the buffer while we were doing
* the I/O and had given up the BufMgrLock (though they would
* be waiting for us to clear the BM_IO_IN_PROGRESS flag).
* That's why this is a loop -- if so, we need to clear the
* I/O flags, remove our pin and start all over again.
*
* People may be making buffers free at any time, so there's no
* reason to think that we have an immediate disaster on our
* hands.
*/ */
if (buf && buf->refcount > 1) FlushBuffer(buf, NULL);
{
inProgress = FALSE;
buf->flags &= ~BM_IO_IN_PROGRESS;
TerminateBufferIO(buf);
UnpinBuffer(buf);
buf = NULL;
}
/* /*
* Somebody could have allocated another buffer for the same * Somebody could have allocated another buffer for the same
* block we are about to read in. (While we flush out the * block we are about to read in. While we flush out the
* dirty buffer, we don't hold the lock and someone could have * dirty buffer, we don't hold the lock and someone could have
* allocated another buffer for the same block. The problem is * allocated another buffer for the same block. The problem is
* we haven't gotten around to insert the new tag into the * we haven't yet inserted the new tag into the buffer table.
* buffer table. So we need to check here. -ay 3/95 * So we need to check here. -ay 3/95
*
* Another reason we have to do this is to update cdb_found_index,
* since the CDB could have disappeared from B1/B2 list while
* we were writing.
*/ */
buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index); buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index);
if (buf2 != NULL) if (buf2 != NULL)
{ {
/* /*
* Found it. Someone has already done what we're about to * Found it. Someone has already done what we were about to
* do. We'll just handle this as if it were found in the * do. We'll just handle this as if it were found in the
* buffer pool in the first place. * buffer pool in the first place. First, give up the
* buffer we were planning to use.
*/ */
if (buf != NULL) TerminateBufferIO(buf, 0);
{ UnpinBuffer(buf);
buf->flags &= ~BM_IO_IN_PROGRESS;
TerminateBufferIO(buf);
/* give up old buffer since we don't need it any more */
UnpinBuffer(buf);
}
PinBuffer(buf2); buf = buf2;
inProgress = (buf2->flags & BM_IO_IN_PROGRESS);
/* remaining code should match code at top of routine */
*foundPtr = TRUE; *foundPtr = TRUE;
if (inProgress)
{
WaitIO(buf2);
inProgress = (buf2->flags & BM_IO_IN_PROGRESS);
}
if (BUFFER_IS_BROKEN(buf2)) PinBuffer(buf);
if (!(buf->flags & BM_VALID))
{ {
*foundPtr = FALSE; if (buf->flags & BM_IO_IN_PROGRESS)
StartBufferIO(buf2, true); {
/* someone else is reading it, wait for them */
WaitIO(buf);
}
if (!(buf->flags & BM_VALID))
{
/*
* If we get here, previous attempts to read the buffer
* must have failed ... but we shall bravely try again.
*/
*foundPtr = FALSE;
StartBufferIO(buf, true);
}
} }
LWLockRelease(BufMgrLock); LWLockRelease(BufMgrLock);
/* return buf;
* Do the cost accounting for vacuum. (XXX perhaps better }
* to consider this a miss? We didn't have to do the read,
* but we did have to write ...)
*/
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageHit;
return buf2; /*
* Somebody could have pinned the buffer while we were doing
* the I/O and had given up the BufMgrLock. If so, we can't
* recycle this buffer --- we need to clear the I/O flags,
* remove our pin and choose a new victim buffer. Similarly,
* we have to start over if somebody re-dirtied the buffer.
*/
if (buf->refcount > 1 || buf->flags & BM_DIRTY || buf->cntxDirty)
{
TerminateBufferIO(buf, 0);
UnpinBuffer(buf);
inProgress = FALSE;
buf = NULL;
} }
} }
} } while (buf == NULL);
/* /*
* At this point we should have the sole pin on a non-dirty buffer and * At this point we should have the sole pin on a non-dirty buffer and
...@@ -506,16 +450,19 @@ BufferAlloc(Relation reln, ...@@ -506,16 +450,19 @@ BufferAlloc(Relation reln,
/* /*
* Tell the buffer replacement strategy that we are replacing the * Tell the buffer replacement strategy that we are replacing the
* buffer content. Then rename the buffer. * buffer content. Then rename the buffer. Clearing BM_VALID here
* is necessary, clearing the dirtybits is just paranoia.
*/ */
StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index); StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index);
buf->tag = newTag; buf->tag = newTag;
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
buf->cntxDirty = false;
/* /*
* Buffer contents are currently invalid. Have to mark IO IN PROGRESS * Buffer contents are currently invalid. Have to mark IO IN PROGRESS
* so no one fiddles with them until the read completes. If this * so no one fiddles with them until the read completes. We may have
* routine has been called simply to allocate a buffer, no io will be * already marked it, in which case we just flip from write to read
* attempted, so the flag isnt set. * status.
*/ */
if (!inProgress) if (!inProgress)
StartBufferIO(buf, true); StartBufferIO(buf, true);
...@@ -524,12 +471,6 @@ BufferAlloc(Relation reln, ...@@ -524,12 +471,6 @@ BufferAlloc(Relation reln,
LWLockRelease(BufMgrLock); LWLockRelease(BufMgrLock);
/*
* Do the cost accounting for vacuum
*/
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageMiss;
return buf; return buf;
} }
...@@ -553,11 +494,13 @@ write_buffer(Buffer buffer, bool release) ...@@ -553,11 +494,13 @@ write_buffer(Buffer buffer, bool release)
bufHdr = &BufferDescriptors[buffer - 1]; bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(bufHdr->refcount > 0); Assert(bufHdr->refcount > 0);
/* /*
* If the buffer is not dirty yet, do vacuum cost accounting. * If the buffer was not dirty already, do vacuum cost accounting.
*/ */
if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive) if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty; VacuumCostBalance += VacuumCostPageDirty;
...@@ -714,7 +657,6 @@ BufferSync(int percent, int maxpages) ...@@ -714,7 +657,6 @@ BufferSync(int percent, int maxpages)
BufferTag *buftags; BufferTag *buftags;
int num_buffer_dirty; int num_buffer_dirty;
int i; int i;
ErrorContextCallback errcontext;
/* /*
* Get a list of all currently dirty buffers and how many there are. * Get a list of all currently dirty buffers and how many there are.
...@@ -741,25 +683,14 @@ BufferSync(int percent, int maxpages) ...@@ -741,25 +683,14 @@ BufferSync(int percent, int maxpages)
if (maxpages > 0 && num_buffer_dirty > maxpages) if (maxpages > 0 && num_buffer_dirty > maxpages)
num_buffer_dirty = maxpages; num_buffer_dirty = maxpages;
/* Setup error traceback support for ereport() */
errcontext.callback = buffer_write_error_callback;
errcontext.arg = NULL;
errcontext.previous = error_context_stack;
error_context_stack = &errcontext;
/* /*
* Loop over buffers to be written. Note the BufMgrLock is held at * Loop over buffers to be written. Note the BufMgrLock is held at
* loop top, but is released and reacquired intraloop, so we aren't * loop top, but is released and reacquired within FlushBuffer,
* holding it long. * so we aren't holding it long.
*/ */
for (i = 0; i < num_buffer_dirty; i++) for (i = 0; i < num_buffer_dirty; i++)
{ {
BufferDesc *bufHdr = dirty_buffers[i]; BufferDesc *bufHdr = dirty_buffers[i];
Buffer buffer;
XLogRecPtr recptr;
SMgrRelation reln;
errcontext.arg = bufHdr;
/* /*
* Check it is still the same page and still needs writing. * Check it is still the same page and still needs writing.
...@@ -773,9 +704,9 @@ BufferSync(int percent, int maxpages) ...@@ -773,9 +704,9 @@ BufferSync(int percent, int maxpages)
*/ */
if (!(bufHdr->flags & BM_VALID)) if (!(bufHdr->flags & BM_VALID))
continue; continue;
if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) if (!BUFFERTAGS_EQUAL(bufHdr->tag, buftags[i]))
continue; continue;
if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) if (!(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
continue; continue;
/* /*
...@@ -788,9 +719,9 @@ BufferSync(int percent, int maxpages) ...@@ -788,9 +719,9 @@ BufferSync(int percent, int maxpages)
/* Still need writing? */ /* Still need writing? */
if (!(bufHdr->flags & BM_VALID)) if (!(bufHdr->flags & BM_VALID))
continue; continue;
if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) if (!BUFFERTAGS_EQUAL(bufHdr->tag, buftags[i]))
continue; continue;
if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) if (!(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
continue; continue;
} }
...@@ -800,78 +731,16 @@ BufferSync(int percent, int maxpages) ...@@ -800,78 +731,16 @@ BufferSync(int percent, int maxpages)
* avoid conflicts with FlushRelationBuffers. * avoid conflicts with FlushRelationBuffers.
*/ */
PinBuffer(bufHdr); PinBuffer(bufHdr);
StartBufferIO(bufHdr, false); /* output IO start */ StartBufferIO(bufHdr, false);
/* Release BufMgrLock while doing xlog work */
LWLockRelease(BufMgrLock);
buffer = BufferDescriptorGetBuffer(bufHdr);
/*
* Protect buffer content against concurrent update
*/
LockBuffer(buffer, BUFFER_LOCK_SHARE);
/*
* Force XLOG flush for buffer' LSN
*/
recptr = BufferGetLSN(bufHdr);
XLogFlush(recptr);
/*
* Now it's safe to write buffer to disk. Note that no one else
* should have been able to write it while we were busy with
* locking and log flushing because we set the IO flag.
*
* Before we issue the actual write command, clear the just-dirtied
* flag. This lets us recognize concurrent changes (note that only
* hint-bit changes are possible since we hold the buffer shlock).
*/
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty);
bufHdr->flags &= ~BM_JUST_DIRTIED;
LWLockRelease(BufMgrLock);
/* Find smgr relation for buffer */
reln = smgropen(bufHdr->tag.rnode);
/* And write... */
smgrwrite(reln,
bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
/*
* Note that it's safe to change cntxDirty here because of we
* protect it from upper writers by share lock and from other
* bufmgr routines by BM_IO_IN_PROGRESS
*/
bufHdr->cntxDirty = false;
/* FlushBuffer(bufHdr, NULL);
* Release the per-buffer readlock, reacquire BufMgrLock.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); TerminateBufferIO(bufHdr, 0);
bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
TerminateBufferIO(bufHdr); /* Sync IO finished */
BufferFlushCount++;
/*
* If this buffer was marked by someone as DIRTY while we were
* flushing it out we must not clear DIRTY flag - vadim 01/17/97
*/
if (!(bufHdr->flags & BM_JUST_DIRTIED))
bufHdr->flags &= ~BM_DIRTY;
UnpinBuffer(bufHdr); UnpinBuffer(bufHdr);
} }
LWLockRelease(BufMgrLock); LWLockRelease(BufMgrLock);
/* Pop the error context stack */
error_context_stack = errcontext.previous;
pfree(dirty_buffers); pfree(dirty_buffers);
pfree(buftags); pfree(buftags);
...@@ -1123,51 +992,96 @@ BufferGetFileNode(Buffer buffer) ...@@ -1123,51 +992,96 @@ BufferGetFileNode(Buffer buffer)
} }
/* /*
* BufferReplace * FlushBuffer
* Physically write out a shared buffer.
* *
* Write out the buffer corresponding to 'bufHdr'. * NOTE: this actually just passes the buffer contents to the kernel; the
* real write to disk won't happen until the kernel feels like it. This
* is okay from our point of view since we can redo the changes from WAL.
* However, we will need to force the changes to disk via sync/fsync
* before we can checkpoint WAL.
* *
* BufMgrLock must be held at entry, and the buffer must be pinned. * BufMgrLock must be held at entry, and the buffer must be pinned. The
* caller is also responsible for doing StartBufferIO/TerminateBufferIO.
*
* If the caller has an smgr reference for the buffer's relation, pass it
* as the second parameter. If not, pass NULL. (Do not open relation
* while holding BufMgrLock!)
*/ */
static void static void
BufferReplace(BufferDesc *bufHdr) FlushBuffer(BufferDesc *buf, SMgrRelation reln)
{ {
SMgrRelation reln; Buffer buffer;
XLogRecPtr recptr; XLogRecPtr recptr;
ErrorContextCallback errcontext; ErrorContextCallback errcontext;
/* Transpose cntxDirty into flags while holding BufMgrLock */
buf->cntxDirty = false;
buf->flags |= BM_DIRTY;
/* To check if block content changed while flushing. - vadim 01/17/97 */ /* To check if block content changed while flushing. - vadim 01/17/97 */
bufHdr->flags &= ~BM_JUST_DIRTIED; buf->flags &= ~BM_JUST_DIRTIED;
/* Release BufMgrLock while doing xlog work */
LWLockRelease(BufMgrLock); LWLockRelease(BufMgrLock);
/* Setup error traceback support for ereport() */ /* Setup error traceback support for ereport() */
errcontext.callback = buffer_write_error_callback; errcontext.callback = buffer_write_error_callback;
errcontext.arg = bufHdr; errcontext.arg = buf;
errcontext.previous = error_context_stack; errcontext.previous = error_context_stack;
error_context_stack = &errcontext; error_context_stack = &errcontext;
/* Find smgr relation for buffer while holding minimal locks */
if (reln == NULL)
reln = smgropen(buf->tag.rnode);
buffer = BufferDescriptorGetBuffer(buf);
/* /*
* No need to lock buffer context - no one should be able to end * Protect buffer content against concurrent update. (Note that
* ReadBuffer * hint-bit updates can still occur while the write is in progress,
* but we assume that that will not invalidate the data written.)
*/ */
recptr = BufferGetLSN(bufHdr); LockBuffer(buffer, BUFFER_LOCK_SHARE);
XLogFlush(recptr);
/* Find smgr relation for buffer */ /*
reln = smgropen(bufHdr->tag.rnode); * Force XLOG flush for buffer' LSN. This implements the basic WAL
* rule that log updates must hit disk before any of the data-file
* changes they describe do.
*/
recptr = BufferGetLSN(buf);
XLogFlush(recptr);
/* And write... */ /*
* Now it's safe to write buffer to disk. Note that no one else
* should have been able to write it while we were busy with
* locking and log flushing because caller has set the IO flag.
*
* It would be better to clear BM_JUST_DIRTIED right here, but we'd
* have to reacquire the BufMgrLock and it doesn't seem worth it.
*/
smgrwrite(reln, smgrwrite(reln,
bufHdr->tag.blockNum, buf->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data)); (char *) MAKE_PTR(buf->data));
/* Pop the error context stack */ /* Pop the error context stack */
error_context_stack = errcontext.previous; error_context_stack = errcontext.previous;
/*
* Release the per-buffer readlock, reacquire BufMgrLock.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
BufferFlushCount++; BufferFlushCount++;
/*
* If this buffer was marked by someone as DIRTY while we were
* flushing it out we must not clear DIRTY flag - vadim 01/17/97
*/
if (!(buf->flags & BM_JUST_DIRTIED))
buf->flags &= ~BM_DIRTY;
} }
/* /*
...@@ -1490,6 +1404,7 @@ blockNum=%u, flags=0x%x, refcount=%d %ld)", ...@@ -1490,6 +1404,7 @@ blockNum=%u, flags=0x%x, refcount=%d %ld)",
* In all cases, the caller should be holding AccessExclusiveLock on * In all cases, the caller should be holding AccessExclusiveLock on
* the target relation to ensure that no other backend is busy reading * the target relation to ensure that no other backend is busy reading
* more blocks of the relation (or might do so before we commit). * more blocks of the relation (or might do so before we commit).
* This should also ensure that no one is busy dirtying these blocks.
* *
* Formerly, we considered it an error condition if we found dirty * Formerly, we considered it an error condition if we found dirty
* buffers here. However, since BufferSync no longer forces out all * buffers here. However, since BufferSync no longer forces out all
...@@ -1497,7 +1412,7 @@ blockNum=%u, flags=0x%x, refcount=%d %ld)", ...@@ -1497,7 +1412,7 @@ blockNum=%u, flags=0x%x, refcount=%d %ld)",
* to still be present in the cache due to failure of an earlier * to still be present in the cache due to failure of an earlier
* transaction. So, must flush dirty buffers without complaint. * transaction. So, must flush dirty buffers without complaint.
* *
* Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED * Returns: 0 - Ok, -1 - FAILED TO CLEAR DIRTY BIT, -2 - PINNED
* *
* XXX currently it sequentially searches the buffer pool, should be * XXX currently it sequentially searches the buffer pool, should be
* changed to more clever ways of searching. * changed to more clever ways of searching.
...@@ -1508,38 +1423,41 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) ...@@ -1508,38 +1423,41 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
{ {
int i; int i;
BufferDesc *bufHdr; BufferDesc *bufHdr;
XLogRecPtr recptr;
ErrorContextCallback errcontext;
/* Setup error traceback support for ereport() */
errcontext.callback = buffer_write_error_callback;
errcontext.arg = NULL;
errcontext.previous = error_context_stack;
error_context_stack = &errcontext;
if (rel->rd_istemp) if (rel->rd_istemp)
{ {
for (i = 0; i < NLocBuffer; i++) for (i = 0; i < NLocBuffer; i++)
{ {
bufHdr = &LocalBufferDescriptors[i]; bufHdr = &LocalBufferDescriptors[i];
errcontext.arg = bufHdr;
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
{ {
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) if ((bufHdr->flags & BM_VALID) &&
(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
{ {
/* Open it at the smgr level if not already done */ ErrorContextCallback errcontext;
/* Setup error traceback support for ereport() */
errcontext.callback = buffer_write_error_callback;
errcontext.arg = bufHdr;
errcontext.previous = error_context_stack;
error_context_stack = &errcontext;
/* Open rel at the smgr level if not already done */
if (rel->rd_smgr == NULL) if (rel->rd_smgr == NULL)
rel->rd_smgr = smgropen(rel->rd_node); rel->rd_smgr = smgropen(rel->rd_node);
smgrwrite(rel->rd_smgr, smgrwrite(rel->rd_smgr,
bufHdr->tag.blockNum, bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data)); (char *) MAKE_PTR(bufHdr->data));
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
bufHdr->cntxDirty = false; bufHdr->cntxDirty = false;
/* Pop the error context stack */
error_context_stack = errcontext.previous;
} }
if (LocalRefCount[i] > 0) if (LocalRefCount[i] > 0)
{ {
error_context_stack = errcontext.previous;
elog(WARNING, "FlushRelationBuffers(\"%s\" (local), %u): block %u is referenced (%ld)", elog(WARNING, "FlushRelationBuffers(\"%s\" (local), %u): block %u is referenced (%ld)",
RelationGetRelationName(rel), firstDelBlock, RelationGetRelationName(rel), firstDelBlock,
bufHdr->tag.blockNum, LocalRefCount[i]); bufHdr->tag.blockNum, LocalRefCount[i]);
...@@ -1550,9 +1468,6 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) ...@@ -1550,9 +1468,6 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
} }
} }
/* Pop the error context stack */
error_context_stack = errcontext.previous;
return 0; return 0;
} }
...@@ -1561,67 +1476,37 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) ...@@ -1561,67 +1476,37 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
for (i = 0; i < NBuffers; i++) for (i = 0; i < NBuffers; i++)
{ {
bufHdr = &BufferDescriptors[i]; bufHdr = &BufferDescriptors[i];
errcontext.arg = bufHdr;
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
{ {
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) if ((bufHdr->flags & BM_VALID) &&
(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
{ {
PinBuffer(bufHdr); PinBuffer(bufHdr);
/* Someone else might be flushing buffer */
if (bufHdr->flags & BM_IO_IN_PROGRESS) if (bufHdr->flags & BM_IO_IN_PROGRESS)
WaitIO(bufHdr); WaitIO(bufHdr);
LWLockRelease(BufMgrLock); /* Still dirty? */
/*
* Force XLOG flush for buffer' LSN
*/
recptr = BufferGetLSN(bufHdr);
XLogFlush(recptr);
/*
* Now it's safe to write buffer to disk
*/
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
if (bufHdr->flags & BM_IO_IN_PROGRESS)
WaitIO(bufHdr);
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
{ {
bufHdr->flags &= ~BM_JUST_DIRTIED; StartBufferIO(bufHdr, false);
StartBufferIO(bufHdr, false); /* output IO start */
LWLockRelease(BufMgrLock);
/* Open it at the smgr level if not already done */
if (rel->rd_smgr == NULL)
rel->rd_smgr = smgropen(rel->rd_node);
smgrwrite(rel->rd_smgr,
bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
BufferFlushCount++;
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); FlushBuffer(bufHdr, rel->rd_smgr);
bufHdr->flags &= ~BM_IO_IN_PROGRESS;
TerminateBufferIO(bufHdr);
Assert(!(bufHdr->flags & BM_JUST_DIRTIED));
bufHdr->flags &= ~BM_DIRTY;
/* TerminateBufferIO(bufHdr, 0);
* Note that it's safe to change cntxDirty here
* because of we protect it from upper writers by
* AccessExclusiveLock and from other bufmgr routines
* by BM_IO_IN_PROGRESS
*/
bufHdr->cntxDirty = false;
} }
UnpinBuffer(bufHdr); UnpinBuffer(bufHdr);
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
{
LWLockRelease(BufMgrLock);
elog(WARNING, "FlushRelationBuffers(\"%s\", %u): block %u was re-dirtied",
RelationGetRelationName(rel), firstDelBlock,
bufHdr->tag.blockNum);
return -1;
}
} }
if (bufHdr->refcount != 0) if (bufHdr->refcount != 0)
{ {
LWLockRelease(BufMgrLock); LWLockRelease(BufMgrLock);
error_context_stack = errcontext.previous;
elog(WARNING, "FlushRelationBuffers(\"%s\", %u): block %u is referenced (private %ld, global %d)", elog(WARNING, "FlushRelationBuffers(\"%s\", %u): block %u is referenced (private %ld, global %d)",
RelationGetRelationName(rel), firstDelBlock, RelationGetRelationName(rel), firstDelBlock,
bufHdr->tag.blockNum, bufHdr->tag.blockNum,
...@@ -1635,9 +1520,6 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) ...@@ -1635,9 +1520,6 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
LWLockRelease(BufMgrLock); LWLockRelease(BufMgrLock);
/* Pop the error context stack */
error_context_stack = errcontext.previous;
return 0; return 0;
} }
...@@ -1645,7 +1527,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) ...@@ -1645,7 +1527,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
* ReleaseBuffer -- remove the pin on a buffer without * ReleaseBuffer -- remove the pin on a buffer without
* marking it dirty. * marking it dirty.
*/ */
int void
ReleaseBuffer(Buffer buffer) ReleaseBuffer(Buffer buffer)
{ {
BufferDesc *bufHdr; BufferDesc *bufHdr;
...@@ -1654,15 +1536,16 @@ ReleaseBuffer(Buffer buffer) ...@@ -1654,15 +1536,16 @@ ReleaseBuffer(Buffer buffer)
{ {
Assert(LocalRefCount[-buffer - 1] > 0); Assert(LocalRefCount[-buffer - 1] > 0);
LocalRefCount[-buffer - 1]--; LocalRefCount[-buffer - 1]--;
return STATUS_OK; return;
} }
if (BAD_BUFFER_ID(buffer)) if (BAD_BUFFER_ID(buffer))
return STATUS_ERROR; elog(ERROR, "bad buffer id: %d", buffer);
bufHdr = &BufferDescriptors[buffer - 1]; bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0); Assert(PrivateRefCount[buffer - 1] > 0);
if (PrivateRefCount[buffer - 1] > 1) if (PrivateRefCount[buffer - 1] > 1)
PrivateRefCount[buffer - 1]--; PrivateRefCount[buffer - 1]--;
else else
...@@ -1671,8 +1554,6 @@ ReleaseBuffer(Buffer buffer) ...@@ -1671,8 +1554,6 @@ ReleaseBuffer(Buffer buffer)
UnpinBuffer(bufHdr); UnpinBuffer(bufHdr);
LWLockRelease(BufMgrLock); LWLockRelease(BufMgrLock);
} }
return STATUS_OK;
} }
#ifdef NOT_USED #ifdef NOT_USED
...@@ -2021,16 +1902,24 @@ StartBufferIO(BufferDesc *buf, bool forInput) ...@@ -2021,16 +1902,24 @@ StartBufferIO(BufferDesc *buf, bool forInput)
* (Assumptions) * (Assumptions)
* My process is executing IO for the buffer * My process is executing IO for the buffer
* BufMgrLock is held * BufMgrLock is held
* BM_IO_IN_PROGRESS mask is set for the buffer
* The buffer is Pinned * The buffer is Pinned
* *
* err_flag must be 0 for successful completion and BM_IO_ERROR for failure.
*
* Because BufMgrLock is held, we are already in an interrupt holdoff here, * Because BufMgrLock is held, we are already in an interrupt holdoff here,
* and do not need another. * and do not need another.
*/ */
static void static void
TerminateBufferIO(BufferDesc *buf) TerminateBufferIO(BufferDesc *buf, int err_flag)
{ {
Assert(buf == InProgressBuf); Assert(buf == InProgressBuf);
Assert(buf->flags & BM_IO_IN_PROGRESS);
buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
buf->flags |= err_flag;
LWLockRelease(buf->io_in_progress_lock); LWLockRelease(buf->io_in_progress_lock);
InProgressBuf = NULL; InProgressBuf = NULL;
} }
...@@ -2062,7 +1951,8 @@ InitBufferIO(void) ...@@ -2062,7 +1951,8 @@ InitBufferIO(void)
/* /*
* Clean up any active buffer I/O after an error. * Clean up any active buffer I/O after an error.
* BufMgrLock isn't held when this function is called. * BufMgrLock isn't held when this function is called,
* but we haven't yet released buffer pins, so the buffer is still pinned.
* *
* If I/O was in progress, we always set BM_IO_ERROR. * If I/O was in progress, we always set BM_IO_ERROR.
*/ */
...@@ -2086,9 +1976,9 @@ AbortBufferIO(void) ...@@ -2086,9 +1976,9 @@ AbortBufferIO(void)
Assert(buf->flags & BM_IO_IN_PROGRESS); Assert(buf->flags & BM_IO_IN_PROGRESS);
if (IsForInput) if (IsForInput)
{ {
Assert(!(buf->flags & BM_DIRTY) && !(buf->cntxDirty)); Assert(!(buf->flags & BM_DIRTY || buf->cntxDirty));
/* Don't think that buffer is valid */ /* We'd better not think buffer is valid yet */
StrategyInvalidateBuffer(buf); Assert(!(buf->flags & BM_VALID));
} }
else else
{ {
...@@ -2106,9 +1996,7 @@ AbortBufferIO(void) ...@@ -2106,9 +1996,7 @@ AbortBufferIO(void)
} }
buf->flags |= BM_DIRTY; buf->flags |= BM_DIRTY;
} }
buf->flags |= BM_IO_ERROR; TerminateBufferIO(buf, BM_IO_ERROR);
buf->flags &= ~BM_IO_IN_PROGRESS;
TerminateBufferIO(buf);
LWLockRelease(BufMgrLock); LWLockRelease(BufMgrLock);
} }
} }
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.42 2004/04/19 23:27:17 tgl Exp $ * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.43 2004/04/21 18:06:30 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -501,7 +501,7 @@ StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag, ...@@ -501,7 +501,7 @@ StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
/* Assert that the buffer remembered in cdb_found is the one */ /* Assert that the buffer remembered in cdb_found is the one */
/* the buffer manager is currently faulting in */ /* the buffer manager is currently faulting in */
Assert(BUFFERTAGS_EQUAL(&(cdb_found->buf_tag), newTag)); Assert(BUFFERTAGS_EQUAL(cdb_found->buf_tag, *newTag));
if (cdb_replace_index >= 0) if (cdb_replace_index >= 0)
{ {
...@@ -513,7 +513,7 @@ StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag, ...@@ -513,7 +513,7 @@ StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
Assert(cdb_replace->list == STRAT_LIST_T1 || Assert(cdb_replace->list == STRAT_LIST_T1 ||
cdb_replace->list == STRAT_LIST_T2); cdb_replace->list == STRAT_LIST_T2);
Assert(cdb_replace->buf_id == buf->buf_id); Assert(cdb_replace->buf_id == buf->buf_id);
Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag))); Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));
/* /*
* Under normal circumstances we move the evicted T list entry to * Under normal circumstances we move the evicted T list entry to
...@@ -606,7 +606,7 @@ StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag, ...@@ -606,7 +606,7 @@ StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
Assert(cdb_replace->list == STRAT_LIST_T1 || Assert(cdb_replace->list == STRAT_LIST_T1 ||
cdb_replace->list == STRAT_LIST_T2); cdb_replace->list == STRAT_LIST_T2);
Assert(cdb_replace->buf_id == buf->buf_id); Assert(cdb_replace->buf_id == buf->buf_id);
Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag))); Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));
if (cdb_replace->list == STRAT_LIST_T1) if (cdb_replace->list == STRAT_LIST_T1)
{ {
...@@ -673,7 +673,7 @@ StrategyInvalidateBuffer(BufferDesc *buf) ...@@ -673,7 +673,7 @@ StrategyInvalidateBuffer(BufferDesc *buf)
BufferStrategyCDB *cdb; BufferStrategyCDB *cdb;
/* The buffer cannot be dirty or pinned */ /* The buffer cannot be dirty or pinned */
Assert(!(buf->flags & BM_DIRTY)); Assert(!(buf->flags & BM_DIRTY) || !(buf->flags & BM_VALID));
Assert(buf->refcount == 0); Assert(buf->refcount == 0);
/* /*
...@@ -695,16 +695,18 @@ StrategyInvalidateBuffer(BufferDesc *buf) ...@@ -695,16 +695,18 @@ StrategyInvalidateBuffer(BufferDesc *buf)
* Clear out the CDB's buffer tag and association with the buffer * Clear out the CDB's buffer tag and association with the buffer
* and add it to the list of unused CDB's * and add it to the list of unused CDB's
*/ */
CLEAR_BUFFERTAG(&(cdb->buf_tag)); CLEAR_BUFFERTAG(cdb->buf_tag);
cdb->buf_id = -1; cdb->buf_id = -1;
cdb->next = StrategyControl->listUnusedCDB; cdb->next = StrategyControl->listUnusedCDB;
StrategyControl->listUnusedCDB = cdb_id; StrategyControl->listUnusedCDB = cdb_id;
/* /*
* Clear out the buffer's tag and add it to the list of * Clear out the buffer's tag and add it to the list of
* currently unused buffers. * currently unused buffers. We must do this to ensure that linear
* scans of the buffer array don't think the buffer is valid.
*/ */
CLEAR_BUFFERTAG(&(buf->tag)); CLEAR_BUFFERTAG(buf->tag);
buf->flags &= ~(BM_VALID | BM_DIRTY);
buf->bufNext = StrategyControl->listFreeBuffers; buf->bufNext = StrategyControl->listFreeBuffers;
StrategyControl->listFreeBuffers = buf->buf_id; StrategyControl->listFreeBuffers = buf->buf_id;
} }
...@@ -864,7 +866,7 @@ StrategyInitialize(bool init) ...@@ -864,7 +866,7 @@ StrategyInitialize(bool init)
{ {
StrategyCDB[i].next = i + 1; StrategyCDB[i].next = i + 1;
StrategyCDB[i].list = STRAT_LIST_UNUSED; StrategyCDB[i].list = STRAT_LIST_UNUSED;
CLEAR_BUFFERTAG(&(StrategyCDB[i].buf_tag)); CLEAR_BUFFERTAG(StrategyCDB[i].buf_tag);
StrategyCDB[i].buf_id = -1; StrategyCDB[i].buf_id = -1;
} }
StrategyCDB[NBuffers * 2 - 1].next = -1; StrategyCDB[NBuffers * 2 - 1].next = -1;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.52 2004/02/10 01:55:25 tgl Exp $ * $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.53 2004/04/21 18:06:30 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -36,19 +36,25 @@ static int nextFreeLocalBuf = 0; ...@@ -36,19 +36,25 @@ static int nextFreeLocalBuf = 0;
/* /*
* LocalBufferAlloc - * LocalBufferAlloc -
* allocate a local buffer. We do round robin allocation for now. * allocate a local buffer. We do round robin allocation for now.
*
* API is similar to bufmgr.c's BufferAlloc, except that we do not need
* to have the BufMgrLock since this is all local. Also, IO_IN_PROGRESS
* does not get set.
*/ */
BufferDesc * BufferDesc *
LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
{ {
BufferTag newTag; /* identity of requested block */
int i; int i;
BufferDesc *bufHdr = NULL; BufferDesc *bufHdr;
INIT_BUFFERTAG(newTag, reln, blockNum);
/* a low tech search for now -- not optimized for scans */ /* a low tech search for now -- not optimized for scans */
for (i = 0; i < NLocBuffer; i++) for (i = 0; i < NLocBuffer; i++)
{ {
if (LocalBufferDescriptors[i].tag.rnode.relNode == bufHdr = &LocalBufferDescriptors[i];
reln->rd_node.relNode && if (BUFFERTAGS_EQUAL(bufHdr->tag, newTag))
LocalBufferDescriptors[i].tag.blockNum == blockNum)
{ {
#ifdef LBDEBUG #ifdef LBDEBUG
fprintf(stderr, "LB ALLOC (%u,%d) %d\n", fprintf(stderr, "LB ALLOC (%u,%d) %d\n",
...@@ -56,8 +62,14 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -56,8 +62,14 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
#endif #endif
LocalRefCount[i]++; LocalRefCount[i]++;
*foundPtr = TRUE; if (bufHdr->flags & BM_VALID)
return &LocalBufferDescriptors[i]; *foundPtr = TRUE;
else
{
/* Previous read attempt must have failed; try again */
*foundPtr = FALSE;
}
return bufHdr;
} }
} }
...@@ -67,6 +79,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -67,6 +79,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
#endif #endif
/* need to get a new buffer (round robin for now) */ /* need to get a new buffer (round robin for now) */
bufHdr = NULL;
for (i = 0; i < NLocBuffer; i++) for (i = 0; i < NLocBuffer; i++)
{ {
int b = (nextFreeLocalBuf + i) % NLocBuffer; int b = (nextFreeLocalBuf + i) % NLocBuffer;
...@@ -108,7 +121,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -108,7 +121,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
* *
* Note this path cannot be taken for a buffer that was previously in * Note this path cannot be taken for a buffer that was previously in
* use, so it's okay to do it (and possibly error out) before marking * use, so it's okay to do it (and possibly error out) before marking
* the buffer as valid. * the buffer as not dirty.
*/ */
if (bufHdr->data == (SHMEM_OFFSET) 0) if (bufHdr->data == (SHMEM_OFFSET) 0)
{ {
...@@ -135,9 +148,8 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -135,9 +148,8 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
/* /*
* it's all ours now. * it's all ours now.
*/ */
bufHdr->tag.rnode = reln->rd_node; bufHdr->tag = newTag;
bufHdr->tag.blockNum = blockNum; bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
bufHdr->flags &= ~BM_DIRTY;
bufHdr->cntxDirty = false; bufHdr->cntxDirty = false;
*foundPtr = FALSE; *foundPtr = FALSE;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.69 2004/04/19 23:27:17 tgl Exp $ * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.70 2004/04/21 18:06:29 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -24,13 +24,12 @@ ...@@ -24,13 +24,12 @@
/* /*
* Flags for buffer descriptors * Flags for buffer descriptors
*/ */
#define BM_DIRTY (1 << 0) #define BM_DIRTY (1 << 0) /* data needs writing */
#define BM_VALID (1 << 1) #define BM_VALID (1 << 1) /* data is valid */
#define BM_DELETED (1 << 2) #define BM_IO_IN_PROGRESS (1 << 2) /* read or write in progress */
#define BM_IO_IN_PROGRESS (1 << 3) #define BM_IO_ERROR (1 << 3) /* previous I/O failed */
#define BM_IO_ERROR (1 << 4) #define BM_JUST_DIRTIED (1 << 4) /* dirtied since write started */
#define BM_JUST_DIRTIED (1 << 5) #define BM_PIN_COUNT_WAITER (1 << 5) /* have waiter for sole pin */
#define BM_PIN_COUNT_WAITER (1 << 6)
typedef bits16 BufFlags; typedef bits16 BufFlags;
...@@ -54,22 +53,21 @@ typedef struct buftag ...@@ -54,22 +53,21 @@ typedef struct buftag
#define CLEAR_BUFFERTAG(a) \ #define CLEAR_BUFFERTAG(a) \
( \ ( \
(a)->rnode.tblNode = InvalidOid, \ (a).rnode.tblNode = InvalidOid, \
(a)->rnode.relNode = InvalidOid, \ (a).rnode.relNode = InvalidOid, \
(a)->blockNum = InvalidBlockNumber \ (a).blockNum = InvalidBlockNumber \
) )
#define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \ #define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \
( \ ( \
(a)->blockNum = (xx_blockNum), \ (a).rnode = (xx_reln)->rd_node, \
(a)->rnode = (xx_reln)->rd_node \ (a).blockNum = (xx_blockNum) \
) )
#define BUFFERTAGS_EQUAL(a,b) \ #define BUFFERTAGS_EQUAL(a,b) \
( \ ( \
(a)->rnode.tblNode == (b)->rnode.tblNode && \ RelFileNodeEquals((a).rnode, (b).rnode) && \
(a)->rnode.relNode == (b)->rnode.relNode && \ (a).blockNum == (b).blockNum \
(a)->blockNum == (b)->blockNum \
) )
/* /*
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.75 2004/02/04 01:24:53 wieck Exp $ * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.76 2004/04/21 18:06:30 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -142,7 +142,7 @@ extern long *LocalRefCount; ...@@ -142,7 +142,7 @@ extern long *LocalRefCount;
* prototypes for functions in bufmgr.c * prototypes for functions in bufmgr.c
*/ */
extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum); extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
extern int ReleaseBuffer(Buffer buffer); extern void ReleaseBuffer(Buffer buffer);
extern void WriteBuffer(Buffer buffer); extern void WriteBuffer(Buffer buffer);
extern void WriteNoReleaseBuffer(Buffer buffer); extern void WriteNoReleaseBuffer(Buffer buffer);
extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment