Commit ad458cfe authored by Heikki Linnakangas

Don't use O_DIRECT when writing WAL files if archiving or streaming is
enabled. Bypassing the kernel cache is counter-productive in that case,
because the archiver/walsender process will read from the WAL file
soon after it's written, and if it's not cached the read will cause
a physical read, eating I/O bandwidth available on the WAL drive.

Also, walreceiver process does unaligned writes, so disable O_DIRECT
in walreceiver process for that reason too.
parent 94f610b1
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.376 2010/02/19 01:04:03 itagaki Exp $ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.377 2010/02/19 10:51:03 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -2686,13 +2686,10 @@ XLogFileClose(void) ...@@ -2686,13 +2686,10 @@ XLogFileClose(void)
* WAL segment files will not be re-read in normal operation, so we advise * WAL segment files will not be re-read in normal operation, so we advise
* the OS to release any cached pages. But do not do so if WAL archiving * the OS to release any cached pages. But do not do so if WAL archiving
* or streaming is active, because archiver and walsender process could use * or streaming is active, because archiver and walsender process could use
* the cache to read the WAL segment. Also, don't bother with it if we * the cache to read the WAL segment.
* are using O_DIRECT, since the kernel is presumably not caching in that
* case.
*/ */
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
if (!XLogIsNeeded() && if (!XLogIsNeeded())
(get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif #endif
...@@ -7652,10 +7649,29 @@ xlog_outrec(StringInfo buf, XLogRecord *record) ...@@ -7652,10 +7649,29 @@ xlog_outrec(StringInfo buf, XLogRecord *record)
static int static int
get_sync_bit(int method) get_sync_bit(int method)
{ {
int o_direct_flag = 0;
/* If fsync is disabled, never open in sync mode */ /* If fsync is disabled, never open in sync mode */
if (!enableFsync) if (!enableFsync)
return 0; return 0;
/*
* Optimize writes by bypassing kernel cache with O_DIRECT when using
* O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are
* disabled, otherwise the archive command or walsender process will
* read the WAL soon after writing it, which is guaranteed to cause a
* physical read if we bypassed the kernel cache. We also skip the
* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the
* same reason.
*
* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
* written by walreceiver is normally read by the startup process soon
* after it's written. Also, walreceiver performs unaligned writes, which
* don't work with O_DIRECT, so it is required for correctness too.
*/
if (!XLogIsNeeded() && !am_walreceiver)
o_direct_flag = PG_O_DIRECT;
switch (method) switch (method)
{ {
/* /*
...@@ -7670,11 +7686,11 @@ get_sync_bit(int method) ...@@ -7670,11 +7686,11 @@ get_sync_bit(int method)
return 0; return 0;
#ifdef OPEN_SYNC_FLAG #ifdef OPEN_SYNC_FLAG
case SYNC_METHOD_OPEN: case SYNC_METHOD_OPEN:
return OPEN_SYNC_FLAG; return OPEN_SYNC_FLAG | o_direct_flag;
#endif #endif
#ifdef OPEN_DATASYNC_FLAG #ifdef OPEN_DATASYNC_FLAG
case SYNC_METHOD_OPEN_DSYNC: case SYNC_METHOD_OPEN_DSYNC:
return OPEN_DATASYNC_FLAG; return OPEN_DATASYNC_FLAG | o_direct_flag;
#endif #endif
default: default:
/* can't happen (unless we are out of sync with option array) */ /* can't happen (unless we are out of sync with option array) */
......
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.4 2010/02/17 04:19:39 tgl Exp $ * $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.5 2010/02/19 10:51:04 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -50,6 +50,9 @@ ...@@ -50,6 +50,9 @@
#include "utils/ps_status.h" #include "utils/ps_status.h"
#include "utils/resowner.h" #include "utils/resowner.h"
/* Global variable to indicate if this process is a walreceiver process */
bool am_walreceiver;
/* libpqreceiver hooks to these when loaded */ /* libpqreceiver hooks to these when loaded */
walrcv_connect_type walrcv_connect = NULL; walrcv_connect_type walrcv_connect = NULL;
walrcv_receive_type walrcv_receive = NULL; walrcv_receive_type walrcv_receive = NULL;
...@@ -158,6 +161,8 @@ WalReceiverMain(void) ...@@ -158,6 +161,8 @@ WalReceiverMain(void)
/* use volatile pointer to prevent code rearrangement */ /* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = WalRcv; volatile WalRcvData *walrcv = WalRcv;
am_walreceiver = true;
/* /*
* WalRcv should be set up already (if we are a backend, we inherit * WalRcv should be set up already (if we are a backend, we inherit
* this by fork() or EXEC_BACKEND mechanism from the postmaster). * this by fork() or EXEC_BACKEND mechanism from the postmaster).
...@@ -424,16 +429,18 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) ...@@ -424,16 +429,18 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
bool use_existent; bool use_existent;
/* /*
* XLOG segment files will be re-read in recovery operation soon, * fsync() and close current file before we switch to next one.
* so we don't need to advise the OS to release any cache page. * We would otherwise have to reopen this file to fsync it later
*/ */
if (recvFile >= 0) if (recvFile >= 0)
{ {
XLogWalRcvFlush();
/* /*
* fsync() before we switch to next file. We would otherwise * XLOG segment files will be re-read by recovery in startup
* have to reopen this file to fsync it later * process soon, so we don't advise the OS to release cache
* pages associated with the file like XLogFileClose() does.
*/ */
XLogWalRcvFlush();
if (close(recvFile) != 0) if (close(recvFile) != 0)
ereport(PANIC, ereport(PANIC,
(errcode_for_file_access(), (errcode_for_file_access(),
...@@ -445,8 +452,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) ...@@ -445,8 +452,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
/* Create/use new log file */ /* Create/use new log file */
XLByteToSeg(recptr, recvId, recvSeg); XLByteToSeg(recptr, recvId, recvSeg);
use_existent = true; use_existent = true;
recvFile = XLogFileInit(recvId, recvSeg, recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true);
&use_existent, true);
recvOff = 0; recvOff = 0;
} }
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.25 2010/01/15 09:19:06 heikki Exp $ * $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.26 2010/02/19 10:51:04 heikki Exp $
*/ */
#ifndef XLOG_DEFS_H #ifndef XLOG_DEFS_H
#define XLOG_DEFS_H #define XLOG_DEFS_H
...@@ -106,23 +106,20 @@ typedef uint32 TimeLineID; ...@@ -106,23 +106,20 @@ typedef uint32 TimeLineID;
* configure determined whether fdatasync() is. * configure determined whether fdatasync() is.
*/ */
#if defined(O_SYNC) #if defined(O_SYNC)
#define BARE_OPEN_SYNC_FLAG O_SYNC #define OPEN_SYNC_FLAG O_SYNC
#elif defined(O_FSYNC) #elif defined(O_FSYNC)
#define BARE_OPEN_SYNC_FLAG O_FSYNC #define OPEN_SYNC_FLAG O_FSYNC
#endif
#ifdef BARE_OPEN_SYNC_FLAG
#define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
#endif #endif
#if defined(O_DSYNC) #if defined(O_DSYNC)
#if defined(OPEN_SYNC_FLAG) #if defined(OPEN_SYNC_FLAG)
/* O_DSYNC is distinct? */ /* O_DSYNC is distinct? */
#if O_DSYNC != BARE_OPEN_SYNC_FLAG #if O_DSYNC != OPEN_SYNC_FLAG
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) #define OPEN_DATASYNC_FLAG O_DSYNC
#endif #endif
#else /* !defined(OPEN_SYNC_FLAG) */ #else /* !defined(OPEN_SYNC_FLAG) */
/* Win32 only has O_DSYNC */ /* Win32 only has O_DSYNC */
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) #define OPEN_DATASYNC_FLAG O_DSYNC
#endif #endif
#endif #endif
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
* *
* Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group
* *
* $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.6 2010/02/03 09:47:19 heikki Exp $ * $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.7 2010/02/19 10:51:04 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
#include "access/xlogdefs.h" #include "access/xlogdefs.h"
#include "storage/spin.h" #include "storage/spin.h"
extern bool am_walreceiver;
/* /*
* MAXCONNINFO: maximum size of a connection string. * MAXCONNINFO: maximum size of a connection string.
* *
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment