Commit 9b178555 authored by Tom Lane

Per previous discussions, get rid of use of sync(2) in favor of
explicitly fsync'ing every (non-temp) file we have written since the
last checkpoint.  In the vast majority of cases, the burden of the
fsyncs should fall on the bgwriter process not on backends.  (To this
end, we assume that an fsync issued by the bgwriter will force out
blocks written to the same file by other processes using other file
descriptors.  Anyone have a problem with that?)  This makes the world
safe for WIN32, which ain't even got sync(2), and really makes the world
safe for Unixen as well, because sync(2) never had the semantics we need:
it offers no way to wait for the requested I/O to finish.

Along the way, fix a bug I recently introduced in xlog recovery:
file truncation replay failed to clear bufmgr buffers for the dropped
blocks, which could result in 'PANIC:  heap_delete_redo: no block'
later on in xlog replay.
parent f024086d
......@@ -13,7 +13,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.19 2003/11/29 19:51:40 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.20 2004/05/31 03:47:54 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -97,7 +97,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
Assert(status == TRANSACTION_STATUS_COMMITTED ||
status == TRANSACTION_STATUS_ABORTED);
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, true);
byteptr += byteno;
......@@ -110,7 +110,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
/* ...->page_status[slotno] = CLOG_PAGE_DIRTY; already done */
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
}
/*
......@@ -128,14 +128,14 @@ TransactionIdGetStatus(TransactionId xid)
char *byteptr;
XidStatus status;
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, false);
byteptr += byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
return status;
}
......@@ -169,16 +169,16 @@ BootStrapCLOG(void)
{
int slotno;
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
/* Create and zero the first page of the commit log */
slotno = ZeroCLOGPage(0, false);
/* Make sure it's written out */
SimpleLruWritePage(ClogCtl, slotno);
SimpleLruWritePage(ClogCtl, slotno, NULL);
/* Assert(ClogCtl->page_status[slotno] == CLOG_PAGE_CLEAN); */
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
}
/*
......@@ -256,12 +256,12 @@ ExtendCLOG(TransactionId newestXact)
pageno = TransactionIdToPage(newestXact);
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
/* Zero the page and make an XLOG entry about it */
ZeroCLOGPage(pageno, true);
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
}
......@@ -351,13 +351,13 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
memcpy(&pageno, XLogRecGetData(record), sizeof(int));
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
slotno = ZeroCLOGPage(pageno, false);
SimpleLruWritePage(ClogCtl, slotno);
SimpleLruWritePage(ClogCtl, slotno, NULL);
/* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
}
}
......
This diff is collapsed.
This diff is collapsed.
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.166 2004/05/29 22:48:19 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.167 2004/05/31 03:48:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1044,6 +1044,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
* bothering to write them out first. This is NOT rollback-able,
* and so should be used only with extreme caution!
*
* There is no particularly good reason why this doesn't have a
* firstDelBlock parameter, except that current callers don't need it.
*
* We assume that the caller holds an exclusive lock on the relation,
* which should assure that no new buffers will be acquired for the rel
* meanwhile.
......@@ -1052,14 +1055,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
/*
 * DropRelationBuffers: convenience wrapper that hands the relation's
 * rd_node and rd_istemp to DropRelFileNodeBuffers.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers, so
 * both the old and the new form of the call appear in the body below.
 * Only one of them is presumably live in the real file -- confirm.
 */
void
DropRelationBuffers(Relation rel)
{
/* presumably the removed line: call using the old two-argument signature -- confirm */
DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp);
/*
 * presumably the replacement line: firstDelBlock = 0, so the
 * "blockNum >= firstDelBlock" filter in DropRelFileNodeBuffers
 * matches from the relation's first block onward -- confirm
 */
DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0);
}
/* ---------------------------------------------------------------------
* DropRelFileNodeBuffers
*
* This is the same as DropRelationBuffers, except that the target
* relation is specified by RelFileNode and temp status.
* relation is specified by RelFileNode and temp status, and one
* may specify the first block to drop.
*
* This is NOT rollback-able. One legitimate use is to clear the
* buffer cache of buffers for a relation that is being deleted
......@@ -1067,7 +1071,8 @@ DropRelationBuffers(Relation rel)
* --------------------------------------------------------------------
*/
void
DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
BlockNumber firstDelBlock)
{
int i;
BufferDesc *bufHdr;
......@@ -1077,7 +1082,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
for (i = 0; i < NLocBuffer; i++)
{
bufHdr = &LocalBufferDescriptors[i];
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
bufHdr->tag.blockNum >= firstDelBlock)
{
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
bufHdr->cntxDirty = false;
......@@ -1094,7 +1100,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
{
bufHdr = &BufferDescriptors[i - 1];
recheck:
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
bufHdr->tag.blockNum >= firstDelBlock)
{
/*
* If there is I/O in progress, better wait till it's done;
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.108 2004/02/23 23:03:10 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.109 2004/05/31 03:48:04 tgl Exp $
*
* NOTES:
*
......@@ -484,6 +484,7 @@ Insert(File file)
DO_DB(_dump_lru());
}
/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
LruInsert(File file)
{
......@@ -685,6 +686,7 @@ filepath(const char *filename)
return buf;
}
/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
FileAccess(File file)
{
......@@ -954,7 +956,10 @@ FileRead(File file, char *buffer, int amount)
file, VfdCache[file].fileName,
VfdCache[file].seekPos, amount, buffer));
FileAccess(file);
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
returnCode = read(VfdCache[file].fd, buffer, amount);
if (returnCode > 0)
VfdCache[file].seekPos += returnCode;
......@@ -975,7 +980,9 @@ FileWrite(File file, char *buffer, int amount)
file, VfdCache[file].fileName,
VfdCache[file].seekPos, amount, buffer));
FileAccess(file);
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
errno = 0;
returnCode = write(VfdCache[file].fd, buffer, amount);
......@@ -992,9 +999,28 @@ FileWrite(File file, char *buffer, int amount)
return returnCode;
}
/*
 * FileSync
 *
 * Make the virtual file accessible via FileAccess(), then pass its
 * descriptor to pg_fsync().
 *
 * Returns the negative result of FileAccess() if that step fails;
 * otherwise returns whatever pg_fsync() returns.
 */
int
FileSync(File file)
{
	int			result;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileSync: %d (%s)",
			   file, VfdCache[file].fileName));

	/* Only sync when FileAccess() succeeded; else hand back its error code. */
	result = FileAccess(file);
	if (result >= 0)
		result = pg_fsync(VfdCache[file].fd);

	return result;
}
long
FileSeek(File file, long offset, int whence)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
......@@ -1014,8 +1040,11 @@ FileSeek(File file, long offset, int whence)
VfdCache[file].seekPos += offset;
break;
case SEEK_END:
FileAccess(file);
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
......@@ -1030,14 +1059,17 @@ FileSeek(File file, long offset, int whence)
if (offset < 0)
elog(ERROR, "invalid seek offset: %ld", offset);
if (VfdCache[file].seekPos != offset)
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
case SEEK_CUR:
if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
case SEEK_END:
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
......@@ -1071,7 +1103,10 @@ FileTruncate(File file, long offset)
DO_DB(elog(LOG, "FileTruncate %d (%s)",
file, VfdCache[file].fileName));
FileAccess(file);
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
return returnCode;
}
......
This diff is collapsed.
......@@ -11,7 +11,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.70 2004/02/11 22:55:25 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.71 2004/05/31 03:48:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -56,7 +56,7 @@ typedef struct f_smgr
static const f_smgr smgrsw[] = {
/* magnetic disk */
{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
mdread, mdwrite, mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
mdread, mdwrite, mdnblocks, mdtruncate, NULL, NULL, mdsync
}
};
......@@ -407,7 +407,7 @@ smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
* Get rid of any leftover buffers for the rel (shouldn't be any in the
* commit case, but there can be in the abort case).
*/
DropRelFileNodeBuffers(rnode, isTemp);
DropRelFileNodeBuffers(rnode, isTemp, 0);
/*
* Tell the free space map to forget this relation. It won't be accessed
......@@ -638,7 +638,7 @@ smgrcommit(void)
if (smgrsw[i].smgr_commit)
{
if (! (*(smgrsw[i].smgr_commit)) ())
elog(FATAL, "transaction commit failed on %s: %m",
elog(ERROR, "transaction commit failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
......@@ -658,7 +658,7 @@ smgrabort(void)
if (smgrsw[i].smgr_abort)
{
if (! (*(smgrsw[i].smgr_abort)) ())
elog(FATAL, "transaction abort failed on %s: %m",
elog(ERROR, "transaction abort failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
......@@ -678,7 +678,7 @@ smgrsync(void)
if (smgrsw[i].smgr_sync)
{
if (! (*(smgrsw[i].smgr_sync)) ())
elog(PANIC, "storage sync failed on %s: %m",
elog(ERROR, "storage sync failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
......@@ -707,6 +707,13 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
reln = smgropen(xlrec->rnode);
/*
* First, force bufmgr to drop any buffers it has for the to-be-
* truncated blocks. We must do this, else subsequent XLogReadBuffer
* operations will not re-extend the file properly.
*/
DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);
/* Can't use smgrtruncate because it would try to xlog */
/*
......
......@@ -6,26 +6,17 @@
* Portions Copyright (c) 2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/slru.h,v 1.5 2004/05/28 05:13:17 tgl Exp $
* $PostgreSQL: pgsql/src/include/access/slru.h,v 1.6 2004/05/31 03:48:08 tgl Exp $
*/
#ifndef SLRU_H
#define SLRU_H
#include "access/xlog.h"
#include "storage/lwlock.h"
/* exported because lwlock.c needs it */
#define NUM_CLOG_BUFFERS 8
/*
* Note: the separation between SlruLockData and SlruSharedData is purely
* historical; the structs could be combined.
*/
typedef struct SlruLockData
{
LWLockId ControlLock;
LWLockId BufferLocks[NUM_CLOG_BUFFERS]; /* Per-buffer I/O locks */
} SlruLockData;
typedef SlruLockData *SlruLock;
/* Opaque structs known only in slru.c */
typedef struct SlruSharedData *SlruShared;
typedef struct SlruFlushData *SlruFlush;
/*
* SlruCtlData is an unshared structure that points to the active information
......@@ -33,13 +24,13 @@ typedef SlruLockData *SlruLock;
*/
typedef struct SlruCtlData
{
void *shared; /* pointer to SlruSharedData */
SlruLock locks;
SlruShared shared;
LWLockId ControlLock;
/*
* Dir is set during SimpleLruShmemInit and does not change thereafter.
* The value is automatically inherited by backends via fork, and
* doesn't need to be in shared memory.
* Dir is set during SimpleLruInit and does not change thereafter.
* Since it's always the same, it doesn't need to be in shared memory.
*/
char Dir[MAXPGPATH];
......@@ -51,13 +42,16 @@ typedef struct SlruCtlData
bool (*PagePrecedes) (int, int);
} SlruCtlData;
typedef SlruCtlData *SlruCtl;
extern int SimpleLruShmemSize(void);
extern void SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir);
extern int SimpleLruZeroPage(SlruCtl ctl, int pageno);
extern char *SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite);
extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
extern char *SimpleLruReadPage(SlruCtl ctl, int pageno,
TransactionId xid, bool forwrite);
extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
extern void SimpleLruSetLatestPage(SlruCtl ctl, int pageno);
extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
......
......@@ -5,13 +5,17 @@
*
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.1 2004/05/29 22:48:23 tgl Exp $
* $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.2 2004/05/31 03:48:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef _BGWRITER_H
#define _BGWRITER_H
#include "storage/block.h"
#include "storage/relfilenode.h"
/* GUC options */
extern int BgWriterDelay;
extern int BgWriterPercent;
......@@ -23,6 +27,9 @@ extern void BackgroundWriterMain(void);
extern void RequestCheckpoint(bool waitforit);
extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno);
extern void AbsorbFsyncRequests(void);
extern int BgWriterShmemSize(void);
extern void BgWriterShmemInit(void);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.80 2004/05/29 22:48:23 tgl Exp $
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.81 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -154,7 +154,8 @@ extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
extern void RelationTruncate(Relation rel, BlockNumber nblocks);
extern int FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock);
extern void DropRelationBuffers(Relation rel);
extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp);
extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
BlockNumber firstDelBlock);
extern void DropBuffers(Oid dbid);
#ifdef NOT_USED
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.44 2004/02/23 23:03:10 tgl Exp $
* $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.45 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -15,7 +15,7 @@
/*
* calls:
*
* File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
* File {Close, Read, Write, Seek, Tell, Sync}
* {File Name Open, Allocate, Free} File
*
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
......@@ -66,6 +66,7 @@ extern void FileClose(File file);
extern void FileUnlink(File file);
extern int FileRead(File file, char *buffer, int amount);
extern int FileWrite(File file, char *buffer, int amount);
extern int FileSync(File file);
extern long FileSeek(File file, long offset, int whence);
extern int FileTruncate(File file, long offset);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.10 2003/12/20 17:31:21 momjian Exp $
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.11 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -37,6 +37,7 @@ typedef enum LWLockId
ControlFileLock,
CheckpointLock,
RelCacheInitLock,
BgWriterCommLock,
NumFixedLWLocks, /* must be last except for
* MaxDynamicLWLock */
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.41 2004/02/11 22:55:26 tgl Exp $
* $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.42 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -83,10 +83,10 @@ extern bool mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer);
extern bool mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer);
extern BlockNumber mdnblocks(SMgrRelation reln);
extern BlockNumber mdtruncate(SMgrRelation reln, BlockNumber nblocks);
extern bool mdcommit(void);
extern bool mdabort(void);
extern bool mdsync(void);
extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
/* smgrtype.c */
extern Datum smgrout(PG_FUNCTION_ARGS);
extern Datum smgrin(PG_FUNCTION_ARGS);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment