Commit 9d645fd8 authored by Tom Lane

Support syncing WAL log to disk using either fsync(), fdatasync(),
O_SYNC, or O_DSYNC (as available on a given platform).  Add GUC parameter
to control sync method.
Also, add defense to XLogWrite to prevent it from going nuts if passed
a target write position that's past the end of the buffers so far filled
by XLogInsert.
parent 4eb5e27a
<!--
$Header: /cvsroot/pgsql/doc/src/sgml/runtime.sgml,v 1.56 2001/03/13 01:17:05 tgl Exp $
$Header: /cvsroot/pgsql/doc/src/sgml/runtime.sgml,v 1.57 2001/03/16 05:44:33 tgl Exp $
-->
<Chapter Id="runtime">
......@@ -1224,8 +1224,8 @@ env PGOPTIONS='-c geqo=off' psql
<term>WAL_BUFFERS (<type>integer</type>)</term>
<listitem>
<para>
Number of disk-page buffers for WAL log. This option can only be set
at server start.
Number of disk-page buffers in shared memory for WAL log.
This option can only be set at server start.
</para>
</listitem>
</varlistentry>
......@@ -1250,6 +1250,23 @@ env PGOPTIONS='-c geqo=off' psql
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>WAL_SYNC_METHOD (<type>string</type>)</term>
<listitem>
<para>
Method used for forcing WAL updates out to disk. Possible
values are
<literal>FSYNC</> (call fsync() at each commit),
<literal>FDATASYNC</> (call fdatasync() at each commit),
<literal>OPEN_SYNC</> (write WAL files with open() option O_SYNC), or
<literal>OPEN_DATASYNC</> (write WAL files with open() option O_DSYNC).
Not all of these choices are available on all platforms.
This option can only be set at server start or in the
<filename>postgresql.conf</filename> file.
</para>
</listitem>
</varlistentry>
</variablelist>
</para>
</sect2>
......
<!-- $Header: /cvsroot/pgsql/doc/src/sgml/wal.sgml,v 1.4 2001/03/13 01:17:05 tgl Exp $ -->
<!-- $Header: /cvsroot/pgsql/doc/src/sgml/wal.sgml,v 1.5 2001/03/16 05:44:33 tgl Exp $ -->
<chapter id="wal">
<title>Write-Ahead Logging (<acronym>WAL</acronym>)</title>
......@@ -281,15 +281,6 @@
<command>CHECKPOINT</command>.
</para>
<para>
Setting the <varname>WAL_DEBUG</varname> parameter to any non-zero
value will result in each <function>LogInsert</function> and
<function>LogFlush</function> <acronym>WAL</acronym> call being
logged to standard error. At present, it makes no difference what
the non-zero value is. This option may be replaced by a more
general mechanism in the future.
</para>
<para>
The <varname>COMMIT_DELAY</varname> parameter defines for how many
microseconds the backend will sleep after writing a commit
......@@ -304,6 +295,24 @@
ten milliseconds, so that any nonzero <varname>COMMIT_DELAY</varname>
setting between 1 and 10000 microseconds will have the same effect.
</para>
<para>
The <varname>WAL_SYNC_METHOD</varname> parameter determines how
Postgres will ask the kernel to force WAL updates out to disk.
All the options should be the same as far as reliability goes,
but it's quite platform-specific which one will be the fastest.
Note that this parameter is irrelevant if <varname>FSYNC</varname>
has been turned off.
</para>
<para>
Setting the <varname>WAL_DEBUG</varname> parameter to any non-zero
value will result in each <function>LogInsert</function> and
<function>LogFlush</function> <acronym>WAL</acronym> call being
logged to standard error. At present, it makes no difference what
the non-zero value is. This option may be replaced by a more
general mechanism in the future.
</para>
</sect1>
</chapter>
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.58 2001/03/14 20:23:04 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.59 2001/03/16 05:44:33 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -42,6 +42,47 @@
#include "miscadmin.h"
/*
 * This chunk of hackery attempts to determine which file sync methods
 * are available on the current platform, and to choose an appropriate
 * default method. We assume that fsync() is always available, and that
 * configure determined whether fdatasync() is.
 *
 * The default is picked in this order of preference: open_datasync
 * (when OPEN_DATASYNC_FLAG ends up defined), then fdatasync (when
 * HAVE_FDATASYNC is defined), then plain fsync.
 */
/* Internal codes for the sync method in use */
#define SYNC_METHOD_FSYNC 0
#define SYNC_METHOD_FDATASYNC 1
#define SYNC_METHOD_OPEN 2 /* used for both O_SYNC and O_DSYNC */
/*
 * OPEN_SYNC_FLAG is the open() flag used for "open_sync": O_SYNC if the
 * platform defines it, else O_FSYNC if that is defined.  If neither is
 * defined, OPEN_SYNC_FLAG is left undefined.
 */
#if defined(O_SYNC)
# define OPEN_SYNC_FLAG O_SYNC
#else
# if defined(O_FSYNC)
# define OPEN_SYNC_FLAG O_FSYNC
# endif
#endif
/*
 * OPEN_DATASYNC_FLAG is the open() flag used for "open_datasync".  It is
 * defined only when OPEN_SYNC_FLAG exists and O_DSYNC is defined with a
 * value different from OPEN_SYNC_FLAG, so that the two open-based methods
 * are never the same flag under two names.
 */
#if defined(OPEN_SYNC_FLAG)
# if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG)
# define OPEN_DATASYNC_FLAG O_DSYNC
# endif
#endif
/*
 * Select the platform default.  For each choice we define the GUC string
 * form, the internal method code, and the flag bit to add to open() calls
 * (zero for the methods that sync with an explicit call instead).
 */
#if defined(OPEN_DATASYNC_FLAG)
# define DEFAULT_SYNC_METHOD_STR "open_datasync"
# define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN
# define DEFAULT_SYNC_FLAGBIT OPEN_DATASYNC_FLAG
#else
# if defined(HAVE_FDATASYNC)
# define DEFAULT_SYNC_METHOD_STR "fdatasync"
# define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC
# define DEFAULT_SYNC_FLAGBIT 0
# else
# define DEFAULT_SYNC_METHOD_STR "fsync"
# define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC
# define DEFAULT_SYNC_FLAGBIT 0
# endif
#endif
/* Max time to wait to acquire XLog activity locks */
#define XLOG_LOCK_TIMEOUT (5*60*1000000) /* 5 minutes */
/* Max time to wait to acquire checkpoint lock */
......@@ -52,10 +93,18 @@ int CheckPointSegments = 3;
int XLOGbuffers = 8;
int XLOGfiles = 0; /* how many files to pre-allocate during ckpt */
int XLOG_DEBUG = 0;
char *XLOG_sync_method = NULL;
const char XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;
char XLOG_archive_dir[MAXPGPATH]; /* null string means delete 'em */
/* these are derived from XLOG_sync_method by assign_xlog_sync_method */
static int sync_method = DEFAULT_SYNC_METHOD;
static int open_sync_bit = DEFAULT_SYNC_FLAGBIT;
#define MinXLOGbuffers 4
#define XLOG_SYNC_BIT (enableFsync ? open_sync_bit : 0)
/*
* ThisStartUpID will be same in all backends --- it identifies current
......@@ -365,6 +414,7 @@ static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(time_t tnow);
static void xlog_outrec(char *buf, XLogRecord *record);
static void issue_xlog_fsync(void);
/*
......@@ -917,6 +967,15 @@ XLogWrite(XLogwrtRqst WriteRqst)
while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
{
/*
* Make sure we're not ahead of the insert process. This could
* happen if we're passed a bogus WriteRqst.Write that is past the
* end of the last page that's been initialized by
* AdvanceXLInsertBuffer.
*/
if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
elog(STOP, "XLogWrite: write request is past end of log");
/* Advance LogwrtResult.Write to end of current buffer page */
LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
......@@ -1004,9 +1063,7 @@ XLogWrite(XLogwrtRqst WriteRqst)
*/
if (openLogOff >= XLogSegSize && !ispartialpage)
{
if (pg_fdatasync(openLogFile) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
openLogId, openLogSeg);
issue_xlog_fsync();
LogwrtResult.Flush = LogwrtResult.Write; /* end of current page */
}
......@@ -1030,24 +1087,24 @@ XLogWrite(XLogwrtRqst WriteRqst)
* we might have no open file or the wrong one. However, we do
* not need to fsync more than one file.
*/
if (openLogFile >= 0 &&
!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
if (sync_method != SYNC_METHOD_OPEN)
{
if (close(openLogFile) != 0)
elog(STOP, "close(logfile %u seg %u) failed: %m",
openLogId, openLogSeg);
openLogFile = -1;
}
if (openLogFile < 0)
{
XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
openLogOff = 0;
if (openLogFile >= 0 &&
!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
{
if (close(openLogFile) != 0)
elog(STOP, "close(logfile %u seg %u) failed: %m",
openLogId, openLogSeg);
openLogFile = -1;
}
if (openLogFile < 0)
{
XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
openLogFile = XLogFileOpen(openLogId, openLogSeg, false);
openLogOff = 0;
}
issue_xlog_fsync();
}
if (pg_fdatasync(openLogFile) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
openLogId, openLogSeg);
LogwrtResult.Flush = LogwrtResult.Write;
}
......@@ -1191,7 +1248,8 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
*/
if (*usexistent)
{
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
S_IRUSR | S_IWUSR);
if (fd < 0)
{
if (errno != ENOENT)
......@@ -1208,6 +1266,7 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
unlink(tpath);
unlink(path);
/* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
fd = BasicOpenFile(tpath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
S_IRUSR | S_IWUSR);
if (fd < 0)
......@@ -1220,8 +1279,8 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
* allow "holes" in files, just seeking to the end doesn't allocate
* intermediate space. This way, we know that we have all the space
* and (after the fsync below) that all the indirect blocks are down
* on disk. Therefore, fdatasync(2) will be sufficient to sync future
* writes to the log file.
* on disk. Therefore, fdatasync(2) or O_DSYNC will be sufficient to
* sync future writes to the log file.
*/
MemSet(zbuffer, 0, sizeof(zbuffer));
for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
......@@ -1261,7 +1320,8 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
log, seg);
#endif
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
S_IRUSR | S_IWUSR);
if (fd < 0)
elog(STOP, "InitReopen(logfile %u seg %u) failed: %m",
log, seg);
......@@ -1280,7 +1340,8 @@ XLogFileOpen(uint32 log, uint32 seg, bool econt)
XLogFileName(path, log, seg);
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
S_IRUSR | S_IWUSR);
if (fd < 0)
{
if (econt && errno == ENOENT)
......@@ -1845,7 +1906,8 @@ WriteControlFile(void)
memset(buffer, 0, BLCKSZ);
memcpy(buffer, ControlFile, sizeof(ControlFileData));
fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR);
fd = BasicOpenFile(ControlFilePath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
S_IRUSR | S_IWUSR);
if (fd < 0)
elog(STOP, "WriteControlFile failed to create control file (%s): %m",
ControlFilePath);
......@@ -2852,3 +2914,120 @@ xlog_outrec(char *buf, XLogRecord *record)
sprintf(buf + strlen(buf), ": %s",
RmgrTable[record->xl_rmid].rm_name);
}
/*
* GUC support routines
*/
/*
 * Validate a proposed wal_sync_method value.
 *
 * Returns true if "method" matches, ignoring case, one of the sync method
 * names compiled into this build; returns false otherwise.  "fsync" is
 * always accepted; the other names are accepted only when the matching
 * platform symbol is defined.
 */
bool
check_xlog_sync_method(const char *method)
{
	/* Names accepted by this build; the list is trimmed by the #ifdefs */
	static const char *const accepted_names[] = {
		"fsync",
#ifdef HAVE_FDATASYNC
		"fdatasync",
#endif
#ifdef OPEN_SYNC_FLAG
		"open_sync",
#endif
#ifdef OPEN_DATASYNC_FLAG
		"open_datasync",
#endif
	};
	int			n_names = (int) (sizeof(accepted_names) / sizeof(accepted_names[0]));
	int			idx;

	for (idx = 0; idx < n_names; idx++)
	{
		if (strcasecmp(method, accepted_names[idx]) == 0)
			return true;
	}
	return false;
}
void
assign_xlog_sync_method(const char *method)
{
int new_sync_method;
int new_sync_bit;
if (strcasecmp(method, "fsync") == 0)
{
new_sync_method = SYNC_METHOD_FSYNC;
new_sync_bit = 0;
}
#ifdef HAVE_FDATASYNC
else if (strcasecmp(method, "fdatasync") == 0)
{
new_sync_method = SYNC_METHOD_FDATASYNC;
new_sync_bit = 0;
}
#endif
#ifdef OPEN_SYNC_FLAG
else if (strcasecmp(method, "open_sync") == 0)
{
new_sync_method = SYNC_METHOD_OPEN;
new_sync_bit = OPEN_SYNC_FLAG;
}
#endif
#ifdef OPEN_DATASYNC_FLAG
else if (strcasecmp(method, "open_datasync") == 0)
{
new_sync_method = SYNC_METHOD_OPEN;
new_sync_bit = OPEN_DATASYNC_FLAG;
}
#endif
else
{
/* Can't get here unless guc.c screwed up */
elog(ERROR, "Bogus xlog sync method %s", method);
new_sync_method = 0; /* keep compiler quiet */
new_sync_bit = 0;
}
if (sync_method != new_sync_method || open_sync_bit != new_sync_bit)
{
/*
* To ensure that no blocks escape unsynced, force an fsync on
* the currently open log segment (if any). Also, if the open
* flag is changing, close the log file so it will be reopened
* (with new flag bit) at next use.
*/
if (openLogFile >= 0)
{
if (pg_fsync(openLogFile) != 0)
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
openLogId, openLogSeg);
if (open_sync_bit != new_sync_bit)
{
if (close(openLogFile) != 0)
elog(STOP, "close(logfile %u seg %u) failed: %m",
openLogId, openLogSeg);
openLogFile = -1;
}
}
sync_method = new_sync_method;
open_sync_bit = new_sync_bit;
}
}
/*
 * Force the current XLOG output file to disk using whichever call matches
 * the active sync_method.  For SYNC_METHOD_OPEN nothing is issued here,
 * since the write itself already synced the data.  Failure of the sync
 * call, or an unrecognized sync_method, is reported with elog(STOP).
 */
static void
issue_xlog_fsync(void)
{
	if (sync_method == SYNC_METHOD_OPEN)
	{
		/* write synced it already */
		return;
	}

	if (sync_method == SYNC_METHOD_FSYNC)
	{
		if (pg_fsync(openLogFile) != 0)
			elog(STOP, "fsync(logfile %u seg %u) failed: %m",
				 openLogId, openLogSeg);
		return;
	}

#ifdef HAVE_FDATASYNC
	if (sync_method == SYNC_METHOD_FDATASYNC)
	{
		if (pg_fdatasync(openLogFile) != 0)
			elog(STOP, "fdatasync(logfile %u seg %u) failed: %m",
				 openLogId, openLogSeg);
		return;
	}
#endif

	/* No known method matched */
	elog(STOP, "bogus sync_method %d", sync_method);
}
......@@ -4,7 +4,7 @@
* Support for grand unified configuration scheme, including SET
* command, configuration file, and command line options.
*
* $Header: /cvsroot/pgsql/src/backend/utils/misc/guc.c,v 1.32 2001/03/13 01:17:06 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/utils/misc/guc.c,v 1.33 2001/03/16 05:44:33 tgl Exp $
*
* Copyright 2000 by PostgreSQL Global Development Group
* Written by Peter Eisentraut <peter_e@gmx.net>.
......@@ -20,6 +20,7 @@
#include "utils/guc.h"
#include "access/xlog.h"
#include "commands/async.h"
#include "libpq/auth.h"
#include "libpq/pqcomm.h"
......@@ -33,23 +34,17 @@
#include "tcop/tcopprot.h"
/* XXX should be in a header file */
/* XXX these should be in other modules' header files */
extern bool Log_connections;
extern int CheckPointSegments;
extern int CheckPointTimeout;
extern int XLOGbuffers;
extern int XLOGfiles;
extern int XLOG_DEBUG;
extern int CommitDelay;
extern int CommitSiblings;
extern bool FixBTree;
#ifdef ENABLE_SYSLOG
extern char *Syslog_facility;
extern char *Syslog_ident;
bool check_facility(const char *facility);
static bool check_facility(const char *facility);
#endif
/*
......@@ -138,7 +133,8 @@ struct config_string
GucContext context;
char **variable;
const char *default_val;
bool (*parse_hook)(const char *);
bool (*parse_hook)(const char *proposed);
void (*assign_hook)(const char *newval);
};
......@@ -330,25 +326,29 @@ static struct config_string
ConfigureNamesString[] =
{
{"krb_server_keyfile", PGC_POSTMASTER, &pg_krb_server_keyfile,
PG_KRB_SRVTAB, NULL},
{"unix_socket_group", PGC_POSTMASTER, &Unix_socket_group,
"", NULL},
PG_KRB_SRVTAB, NULL, NULL},
#ifdef ENABLE_SYSLOG
{"syslog_facility", PGC_POSTMASTER, &Syslog_facility,
"LOCAL0", check_facility},
"LOCAL0", check_facility, NULL},
{"syslog_ident", PGC_POSTMASTER, &Syslog_ident,
"postgres", NULL},
"postgres", NULL, NULL},
#endif
{"unix_socket_group", PGC_POSTMASTER, &Unix_socket_group,
"", NULL, NULL},
{"unix_socket_directory", PGC_POSTMASTER, &UnixSocketDir,
"", NULL},
"", NULL, NULL},
{"virtual_host", PGC_POSTMASTER, &VirtualHost,
"", NULL},
"", NULL, NULL},
{NULL, 0, NULL, NULL, NULL}
{"wal_sync_method", PGC_SIGHUP, &XLOG_sync_method,
XLOG_sync_method_default,
check_xlog_sync_method, assign_xlog_sync_method},
{NULL, 0, NULL, NULL, NULL, NULL}
};
/******** end of options list ********/
......@@ -723,7 +723,10 @@ set_config_option(const char * name, const char * value, GucContext
elog(elevel, "out of memory");
return false;
}
free(*conf->variable);
if (conf->assign_hook)
(conf->assign_hook)(str);
if (*conf->variable)
free(*conf->variable);
*conf->variable = str;
}
}
......@@ -737,7 +740,10 @@ set_config_option(const char * name, const char * value, GucContext
elog(elevel, "out of memory");
return false;
}
free(*conf->variable);
if (conf->assign_hook)
(conf->assign_hook)(str);
if (*conf->variable)
free(*conf->variable);
*conf->variable = str;
}
break;
......@@ -855,7 +861,7 @@ ParseLongOption(const char * string, char ** name, char ** value)
#ifdef ENABLE_SYSLOG
bool
static bool
check_facility(const char *facility)
{
if (strcasecmp(facility,"LOCAL0") == 0) return true;
......
......@@ -107,6 +107,8 @@
#
#wal_buffers = 8 # min 4
#wal_files = 0 # range 0-64
#wal_sync_method = fsync # fsync or fdatasync or open_sync or open_datasync
# Note: default wal_sync_method varies across platforms
#wal_debug = 0 # range 0-16
#commit_delay = 0 # range 0-100000
#commit_siblings = 5 # range 1-1000
......
......@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: xlog.h,v 1.20 2001/03/13 20:32:37 tgl Exp $
* $Id: xlog.h,v 1.21 2001/03/16 05:44:33 tgl Exp $
*/
#ifndef XLOG_H
#define XLOG_H
......@@ -176,6 +176,15 @@ extern StartUpID ThisStartUpID; /* current SUI */
extern bool InRecovery;
extern XLogRecPtr MyLastRecPtr;
/* these variables are GUC parameters related to XLOG */
extern int CheckPointSegments;
extern int XLOGbuffers;
extern int XLOGfiles;
extern int XLOG_DEBUG;
extern char *XLOG_sync_method;
extern const char XLOG_sync_method_default[];
extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
extern void XLogFlush(XLogRecPtr RecPtr);
......@@ -202,4 +211,7 @@ extern void GetRedoRecPtr(void);
*/
extern XLogRecPtr GetUndoRecPtr(void);
extern bool check_xlog_sync_method(const char *method);
extern void assign_xlog_sync_method(const char *method);
#endif /* XLOG_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment