Commit 33cc5d8a authored by Tom Lane

Change s_lock to not use any zero-delay select() calls; these are just a
waste of cycles on single-CPU machines, and of dubious utility on multi-CPU
machines too.
Tweak s_lock_stuck so that caller can specify timeout interval, and
increase interval before declaring stuck spinlock for buffer locks and XLOG
locks.
On systems that have fdatasync(), use that rather than fsync() to sync WAL
log writes.  Ensure that WAL file is entirely allocated during XLogFileInit.
parent 58c4ab9d
This diff is collapsed.
......@@ -772,7 +772,10 @@ PGAC_VAR_INT_TIMEZONE
AC_FUNC_ACCEPT_ARGTYPES
PGAC_FUNC_GETTIMEOFDAY_1ARG
AC_CHECK_FUNCS([fcvt getopt_long memmove pstat setproctitle setsid sigprocmask sysconf waitpid dlopen])
AC_CHECK_FUNCS([fcvt getopt_long memmove pstat setproctitle setsid sigprocmask sysconf waitpid dlopen fdatasync])
dnl Check whether <unistd.h> declares fdatasync().
AC_EGREP_HEADER(fdatasync, unistd.h, AC_DEFINE(HAVE_FDATASYNC_DECL))
AC_CACHE_CHECK([for PS_STRINGS], [pgac_cv_var_PS_STRINGS],
[AC_TRY_LINK(
......
......@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.53 2001/02/13 20:40:25 vadim Exp $
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.54 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -39,6 +39,13 @@
#include "miscadmin.h"
/* Max time to wait to acquire XLog activity locks */
#define XLOG_LOCK_TIMEOUT (5*60*1000000) /* 5 minutes */
/* Max time to wait to acquire checkpoint lock */
#define CHECKPOINT_LOCK_TIMEOUT (10*60*1000000) /* 10 minutes */
int XLOGbuffers = 8;
int XLOGfiles = 0; /* how many files to pre-allocate */
XLogRecPtr MyLastRecPtr = {0, 0};
......@@ -178,8 +185,8 @@ typedef struct BkpBlock
/*
* We break each log file in 16Mb segments
*/
#define XLogSegSize (16*1024*1024)
#define XLogLastSeg (0xffffffff / XLogSegSize)
#define XLogSegSize ((uint32) (16*1024*1024))
#define XLogLastSeg (((uint32) 0xffffffff) / XLogSegSize)
#define XLogFileSize (XLogLastSeg * XLogSegSize)
#define NextLogSeg(_logId, _logSeg) \
......@@ -423,7 +430,7 @@ begin:;
}
}
}
S_LOCK_SLEEP(&(XLogCtl->insert_lck), i++);
S_LOCK_SLEEP(&(XLogCtl->insert_lck), i++, XLOG_LOCK_TIMEOUT);
if (!TAS(&(XLogCtl->insert_lck)))
break;
}
......@@ -721,7 +728,7 @@ XLogFlush(XLogRecPtr record)
break;
}
}
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++);
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++, XLOG_LOCK_TIMEOUT);
}
if (logFile >= 0 && (LgwrResult.Write.xlogid != logId ||
......@@ -741,7 +748,7 @@ XLogFlush(XLogRecPtr record)
logFile = XLogFileOpen(logId, logSeg, false);
}
if (pg_fsync(logFile) != 0)
if (pg_fdatasync(logFile) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);
LgwrResult.Flush = LgwrResult.Write;
......@@ -826,7 +833,7 @@ GetFreeXLBuffer()
InitXLBuffer(curridx);
return;
}
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++);
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++, XLOG_LOCK_TIMEOUT);
}
}
......@@ -846,7 +853,7 @@ XLogWrite(char *buffer)
{
if (wcnt > 0)
{
if (pg_fsync(logFile) != 0)
if (pg_fdatasync(logFile) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);
if (LgwrResult.Write.xlogid != logId)
......@@ -928,7 +935,7 @@ XLogWrite(char *buffer)
if (XLByteLT(LgwrResult.Flush, LgwrRqst.Flush) &&
XLByteLE(LgwrRqst.Flush, LgwrResult.Write))
{
if (pg_fsync(logFile) != 0)
if (pg_fdatasync(logFile) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);
LgwrResult.Flush = LgwrResult.Write;
......@@ -948,13 +955,14 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
{
char path[MAXPGPATH];
char tpath[MAXPGPATH];
char zbuffer[BLCKSZ];
int fd;
int nbytes;
XLogFileName(path, log, seg);
/*
* Try to use existent file (checkpoint maker
* creates it sometime).
* Try to use existent file (checkpoint maker creates it sometimes).
*/
if (*usexistent)
{
......@@ -963,7 +971,7 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
{
if (errno != ENOENT)
elog(STOP, "InitOpen(logfile %u seg %u) failed: %m",
logId, logSeg);
logId, logSeg);
}
else
return(fd);
......@@ -979,33 +987,44 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
elog(STOP, "InitCreate(logfile %u seg %u) failed: %m",
logId, logSeg);
if (lseek(fd, XLogSegSize - 1, SEEK_SET) != (off_t) (XLogSegSize - 1))
elog(STOP, "lseek(logfile %u seg %u) failed: %m",
logId, logSeg);
if (write(fd, "", 1) != 1)
elog(STOP, "write(logfile %u seg %u) failed: %m",
logId, logSeg);
/*
* Zero-fill the file. We have to do this the hard way to ensure that
* all the file space has really been allocated --- on platforms that
* allow "holes" in files, just seeking to the end doesn't allocate
* intermediate space. This way, we know that we have all the space
* and (after the fsync below) that all the indirect blocks are down
* on disk. Therefore, fdatasync(2) will be sufficient to sync future
* writes to the log file.
*/
MemSet(zbuffer, 0, sizeof(zbuffer));
for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
{
if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
logId, logSeg);
}
if (pg_fsync(fd) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
logId, logSeg);
if (lseek(fd, 0, SEEK_SET) < 0)
elog(STOP, "lseek(logfile %u seg %u off %u) failed: %m",
log, seg, 0);
close(fd);
/*
* Prefer link() to rename() here just to be sure that we don't overwrite
* an existing logfile. However, there shouldn't be one, so rename()
* is an acceptable substitute except for the truly paranoid.
*/
#ifndef __BEOS__
if (link(tpath, path) < 0)
elog(STOP, "InitRelink(logfile %u seg %u) failed: %m",
logId, logSeg);
unlink(tpath);
#else
if (rename(tpath, path) < 0)
#endif
elog(STOP, "InitRelink(logfile %u seg %u) failed: %m",
logId, logSeg);
unlink(tpath);
#endif
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
......@@ -2101,7 +2120,8 @@ CreateCheckPoint(bool shutdown)
/* Grab lock, using larger than normal sleep between tries (1 sec) */
while (TAS(&(XLogCtl->chkp_lck)))
{
S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck), spins++, 1000000);
S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck), spins++,
CHECKPOINT_LOCK_TIMEOUT, 1000000);
}
memset(&checkPoint, 0, sizeof(checkPoint));
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.106 2001/01/24 19:43:05 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.107 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1990,6 +1990,9 @@ UnlockBuffers(void)
}
}
/* Max time to wait to acquire a buffer read or write lock */
#define BUFFER_LOCK_TIMEOUT (10*60*1000000) /* 10 minutes */
void
LockBuffer(Buffer buffer, int mode)
{
......@@ -2041,7 +2044,7 @@ LockBuffer(Buffer buffer, int mode)
{
S_UNLOCK(&(buf->cntx_lock));
RESUME_INTERRUPTS();
S_LOCK_SLEEP(&(buf->cntx_lock), i++);
S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT);
HOLD_INTERRUPTS();
S_LOCK(&(buf->cntx_lock));
}
......@@ -2069,7 +2072,7 @@ LockBuffer(Buffer buffer, int mode)
}
S_UNLOCK(&(buf->cntx_lock));
RESUME_INTERRUPTS();
S_LOCK_SLEEP(&(buf->cntx_lock), i++);
S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT);
HOLD_INTERRUPTS();
S_LOCK(&(buf->cntx_lock));
}
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.32 2001/01/24 19:43:06 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.33 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -21,23 +21,39 @@
#include "storage/s_lock.h"
/*
/*----------
* Each time we busy spin we select the next element of this array as the
* number of microseconds to wait. This accomplishes pseudo random back-off.
* Values are not critical but 10 milliseconds is a common platform
* granularity.
*
* Total time to cycle through all 20 entries might be about .07 sec,
* so the given value of S_MAX_BUSY results in timeout after ~70 sec.
* Note that on most platforms, specified values will be rounded up to the
* next multiple of a clock tick, which is often ten milliseconds (10000).
* So, we are being way overoptimistic to assume that these different values
* are really different, other than the last. But there are a few platforms
* with better-than-usual timekeeping, and on these we will get pretty good
* pseudo-random behavior.
*
* Total time to cycle through all 20 entries will be at least 100 msec,
* more commonly (10 msec resolution) 220 msec, and on some platforms
* as much as 420 msec (when the remainder of the current tick cycle is
* ignored in deciding when to time out, as on FreeBSD and older Linuxen).
* We use the 100msec figure to figure max_spins, so actual timeouts may
* be as much as four times the nominal value, but will never be less.
*----------
*/
#define S_NSPINCYCLE 20
#define S_MAX_BUSY 1000 * S_NSPINCYCLE
int s_spincycle[S_NSPINCYCLE] =
{ 0, 0, 0, 0, 10000, 0, 0, 0, 10000, 0,
0, 10000, 0, 0, 10000, 0, 10000, 0, 10000, 10000
{ 1, 10, 100, 1000,
10000, 1000, 1000, 1000,
10000, 1000, 1000, 10000,
1000, 1000, 10000, 1000,
10000, 1000, 10000, 30000
};
#define AVG_SPINCYCLE 5000 /* average entry in microsec: 100ms / 20 */
#define DEFAULT_TIMEOUT (100*1000000) /* default timeout: 100 sec */
/*
* s_lock_stuck() - complain about a stuck spinlock
......@@ -58,34 +74,40 @@ s_lock_stuck(volatile slock_t *lock, const char *file, const int line)
/*
* s_lock_sleep() - sleep a pseudo-random amount of time, check for timeout
*
* Normally 'microsec' is 0, specifying to use the next s_spincycle[] value.
* The 'timeout' is given in microsec, or may be 0 for "infinity". Note that
* this will be a lower bound (a fairly loose lower bound, on most platforms).
*
* 'microsec' is the number of microsec to delay per loop. Normally
* 'microsec' is 0, specifying to use the next s_spincycle[] value.
* Some callers may pass a nonzero interval, specifying to use exactly that
* delay value rather than a pseudo-random delay.
*/
void
s_lock_sleep(unsigned spins, int microsec,
s_lock_sleep(unsigned spins, int timeout, int microsec,
volatile slock_t *lock,
const char *file, const int line)
{
struct timeval delay;
unsigned max_spins;
if (microsec > 0)
{
delay.tv_sec = 0;
delay.tv_usec = microsec;
/* two-minute timeout in this case */
max_spins = 120000000 / microsec;
}
else
{
delay.tv_sec = 0;
delay.tv_usec = s_spincycle[spins % S_NSPINCYCLE];
max_spins = S_MAX_BUSY;
microsec = AVG_SPINCYCLE; /* use average to figure timeout */
}
if (spins > max_spins)
s_lock_stuck(lock, file, line);
if (timeout > 0)
{
unsigned max_spins = timeout / microsec;
if (spins > max_spins)
s_lock_stuck(lock, file, line);
}
(void) select(0, NULL, NULL, NULL, &delay);
}
......@@ -110,7 +132,7 @@ s_lock(volatile slock_t *lock, const char *file, const int line)
*/
while (TAS(lock))
{
s_lock_sleep(spins++, 0, lock, file, line);
s_lock_sleep(spins++, DEFAULT_TIMEOUT, 0, lock, file, line);
CHECK_FOR_INTERRUPTS();
}
}
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.72 2001/02/17 01:00:04 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.73 2001/02/18 04:39:42 tgl Exp $
*
* NOTES:
*
......@@ -193,7 +193,7 @@ static char *filepath(char *filename);
static long pg_nofile(void);
/*
* pg_fsync --- same as fsync except does nothing if -F switch was given
* pg_fsync --- same as fsync except does nothing if enableFsync is off
*/
int
pg_fsync(int fd)
......@@ -204,6 +204,26 @@ pg_fsync(int fd)
return 0;
}
/*
 * pg_fdatasync --- flush a file's data to disk, honoring enableFsync
 *
 * When enableFsync is off this is a no-op that reports success (0).
 * Otherwise the result of the underlying sync call is returned unchanged.
 *
 * fdatasync() is used where configure found it (HAVE_FDATASYNC); on
 * platforms without it we fall back to a plain fsync().
 */
int
pg_fdatasync(int fd)
{
	/* Syncing disabled: succeed without touching the descriptor. */
	if (!enableFsync)
		return 0;

#ifdef HAVE_FDATASYNC
	return fdatasync(fd);
#else
	return fsync(fd);
#endif
}
/*
* BasicOpenFile --- same as open(2) except can free other FDs if needed
*
......
......@@ -8,7 +8,7 @@
* or in config.h afterwards. Of course, if you edit config.h, then your
* changes will be overwritten the next time you run configure.
*
* $Id: config.h.in,v 1.157 2001/01/22 23:28:52 tgl Exp $
* $Id: config.h.in,v 1.158 2001/02/18 04:39:42 tgl Exp $
*/
#ifndef CONFIG_H
......@@ -548,6 +548,19 @@ extern void srandom(unsigned int seed);
*/
#define MAX_RANDOM_VALUE (0x7FFFFFFF)
/* Define if you have dlopen() */
#undef HAVE_DLOPEN
/* Define if you have fdatasync() */
#undef HAVE_FDATASYNC
/* Define if the standard header unistd.h declares fdatasync() */
#undef HAVE_FDATASYNC_DECL
/*
 * If fdatasync() is available but unistd.h does not declare it, supply
 * the prototype here so that callers get a proper declaration.
 */
#if defined(HAVE_FDATASYNC) && !defined(HAVE_FDATASYNC_DECL)
extern int fdatasync(int fildes);
#endif
/* Set to 1 if you have libz.a */
#undef HAVE_LIBZ
......@@ -611,9 +624,6 @@ extern void srandom(unsigned int seed);
/* Define if C++ compiler accepts "#include <string>" */
#undef HAVE_CXX_STRING_HEADER
/* Define if you have dlopen() */
#undef HAVE_DLOPEN
/* Define if you have the optreset variable */
#undef HAVE_INT_OPTRESET
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: fd.h,v 1.26 2001/01/24 19:43:27 momjian Exp $
* $Id: fd.h,v 1.27 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -71,5 +71,6 @@ extern int BasicOpenFile(FileName fileName, int fileFlags, int fileMode);
extern void closeAllVfds(void);
extern void AtEOXact_Files(void);
extern int pg_fsync(int fd);
extern int pg_fdatasync(int fd);
#endif /* FD_H */
......@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/include/storage/s_lock.h,v 1.89 2001/02/16 23:50:40 tgl Exp $
* $Header: /cvsroot/pgsql/src/include/storage/s_lock.h,v 1.90 2001/02/18 04:39:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -48,11 +48,12 @@
* unsigned spins = 0;
*
* while (TAS(lock))
* S_LOCK_SLEEP(lock, spins++);
* S_LOCK_SLEEP(lock, spins++, timeout);
* }
*
* where S_LOCK_SLEEP() checks for timeout and sleeps for a short
* interval. Callers that want to perform useful work while waiting
* interval. (The timeout is expressed in microseconds, or can be 0 for
* "infinity".) Callers that want to perform useful work while waiting
* can write out this entire loop and insert the "useful work" inside
* the loop.
*
......@@ -86,7 +87,7 @@
/* Platform-independent out-of-line support routines */
extern void s_lock(volatile slock_t *lock,
const char *file, const int line);
extern void s_lock_sleep(unsigned spins, int microsec,
extern void s_lock_sleep(unsigned spins, int timeout, int microsec,
volatile slock_t *lock,
const char *file, const int line);
......@@ -518,13 +519,13 @@ extern int tas_sema(volatile slock_t *lock);
#endif /* S_LOCK */
#if !defined(S_LOCK_SLEEP)
#define S_LOCK_SLEEP(lock,spins) \
s_lock_sleep((spins), 0, (lock), __FILE__, __LINE__)
#define S_LOCK_SLEEP(lock,spins,timeout) \
s_lock_sleep((spins), (timeout), 0, (lock), __FILE__, __LINE__)
#endif /* S_LOCK_SLEEP */
#if !defined(S_LOCK_SLEEP_INTERVAL)
#define S_LOCK_SLEEP_INTERVAL(lock,spins,microsec) \
s_lock_sleep((spins), (microsec), (lock), __FILE__, __LINE__)
#define S_LOCK_SLEEP_INTERVAL(lock,spins,timeout,microsec) \
s_lock_sleep((spins), (timeout), (microsec), (lock), __FILE__, __LINE__)
#endif /* S_LOCK_SLEEP_INTERVAL */
#if !defined(S_LOCK_FREE)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment