Commit ea551608 authored by Heikki Linnakangas's avatar Heikki Linnakangas

In walsender, don't sleep if there's outstanding WAL waiting to be sent,

otherwise we effectively rate-limit the streaming as pointed out by
Simon Riggs. Also, send the WAL in smaller chunks, to respond to signals
more promptly.
parent 4ed4b6c5
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/replication/walsender.c,v 1.20 2010/05/09 18:11:55 tgl Exp $ * $PostgreSQL: pgsql/src/backend/replication/walsender.c,v 1.21 2010/05/26 22:21:33 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -100,13 +100,19 @@ static void InitWalSnd(void); ...@@ -100,13 +100,19 @@ static void InitWalSnd(void);
static void WalSndHandshake(void); static void WalSndHandshake(void);
static void WalSndKill(int code, Datum arg); static void WalSndKill(int code, Datum arg);
static void XLogRead(char *buf, XLogRecPtr recptr, Size nbytes); static void XLogRead(char *buf, XLogRecPtr recptr, Size nbytes);
static bool XLogSend(StringInfo outMsg); static bool XLogSend(StringInfo outMsg, bool *caughtup);
static void CheckClosedConnection(void); static void CheckClosedConnection(void);
/* /*
* How much WAL to send in one message? Must be >= XLOG_BLCKSZ. * How much WAL to send in one message? Must be >= XLOG_BLCKSZ.
*
* We don't have a good idea of what a good value would be; there's some
* overhead per message in both walsender and walreceiver, but on the other
* hand sending large batches makes walsender less responsive to signals
* because signals are checked only between messages. 128kB seems like
* a reasonable guess for now.
*/ */
#define MAX_SEND_SIZE (XLOG_SEG_SIZE / 2) #define MAX_SEND_SIZE (128 * 1024)
/* Main entry point for walsender process */ /* Main entry point for walsender process */
int int
...@@ -360,6 +366,7 @@ static int ...@@ -360,6 +366,7 @@ static int
WalSndLoop(void) WalSndLoop(void)
{ {
StringInfoData output_message; StringInfoData output_message;
bool caughtup = false;
initStringInfo(&output_message); initStringInfo(&output_message);
...@@ -387,7 +394,7 @@ WalSndLoop(void) ...@@ -387,7 +394,7 @@ WalSndLoop(void)
*/ */
if (ready_to_stop) if (ready_to_stop)
{ {
XLogSend(&output_message); XLogSend(&output_message, &caughtup);
shutdown_requested = true; shutdown_requested = true;
} }
...@@ -402,31 +409,32 @@ WalSndLoop(void) ...@@ -402,31 +409,32 @@ WalSndLoop(void)
} }
/* /*
* Nap for the configured time or until a message arrives. * If we had sent all accumulated WAL in last round, nap for the
* configured time before retrying.
* *
* On some platforms, signals won't interrupt the sleep. To ensure we * On some platforms, signals won't interrupt the sleep. To ensure we
* respond reasonably promptly when someone signals us, break down the * respond reasonably promptly when someone signals us, break down the
* sleep into NAPTIME_PER_CYCLE increments, and check for * sleep into NAPTIME_PER_CYCLE increments, and check for
* interrupts after each nap. * interrupts after each nap.
*/ */
remain = WalSndDelay * 1000L; if (caughtup)
while (remain > 0)
{ {
if (got_SIGHUP || shutdown_requested || ready_to_stop) remain = WalSndDelay * 1000L;
break; while (remain > 0)
{
/* Check for interrupts */
if (got_SIGHUP || shutdown_requested || ready_to_stop)
break;
/* /* Sleep and check that the connection is still alive */
* Check to see whether a message from the standby or an interrupt pg_usleep(remain > NAPTIME_PER_CYCLE ? NAPTIME_PER_CYCLE : remain);
* from other processes has arrived. CheckClosedConnection();
*/
pg_usleep(remain > NAPTIME_PER_CYCLE ? NAPTIME_PER_CYCLE : remain);
CheckClosedConnection();
remain -= NAPTIME_PER_CYCLE; remain -= NAPTIME_PER_CYCLE;
}
} }
/* Attempt to send the log once every loop */ /* Attempt to send the log once every loop */
if (!XLogSend(&output_message)) if (!XLogSend(&output_message, &caughtup))
goto eof; goto eof;
} }
...@@ -623,15 +631,20 @@ XLogRead(char *buf, XLogRecPtr recptr, Size nbytes) ...@@ -623,15 +631,20 @@ XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
} }
/* /*
* Read all WAL that's been written (and flushed) since last cycle, and send * Read up to MAX_SEND_SIZE bytes of WAL that's been written (and flushed),
* it to client. * but not yet sent to the client, and send it. If there is no unsent WAL,
* *caughtup is set to true and nothing is sent, otherwise *caughtup is set
* to false.
* *
* Returns true if OK, false if trouble. * Returns true if OK, false if trouble.
*/ */
static bool static bool
XLogSend(StringInfo outMsg) XLogSend(StringInfo outMsg, bool *caughtup)
{ {
XLogRecPtr SendRqstPtr; XLogRecPtr SendRqstPtr;
XLogRecPtr startptr;
XLogRecPtr endptr;
Size nbytes;
char activitymsg[50]; char activitymsg[50];
/* use volatile pointer to prevent code rearrangement */ /* use volatile pointer to prevent code rearrangement */
...@@ -642,84 +655,82 @@ XLogSend(StringInfo outMsg) ...@@ -642,84 +655,82 @@ XLogSend(StringInfo outMsg)
/* Quick exit if nothing to do */ /* Quick exit if nothing to do */
if (!XLByteLT(sentPtr, SendRqstPtr)) if (!XLByteLT(sentPtr, SendRqstPtr))
{
*caughtup = true;
return true; return true;
}
/*
* Otherwise let the caller know that we're not fully caught up. Unless
* there's a huge backlog, we'll be caught up to the current WriteRecPtr
* after we've sent everything below, but more WAL could accumulate while
* we're busy sending.
*/
*caughtup = false;
/* /*
* We gather multiple records together by issuing just one XLogRead() of a * Figure out how much to send in one message. If there's less than
* suitable size, and send them as one CopyData message. Repeat until * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
* we've sent everything we can. * MAX_SEND_SIZE bytes, but round to page boundary.
*
* The rounding is not only for performance reasons. Walreceiver
* relies on the fact that we never split a WAL record across two
* messages. Since a long WAL record is split at page boundary into
* continuation records, page boundary is always a safe cut-off point.
* We also assume that SendRqstPtr never points in the middle of a WAL
* record.
*/ */
while (XLByteLT(sentPtr, SendRqstPtr)) startptr = sentPtr;
if (startptr.xrecoff >= XLogFileSize)
{ {
XLogRecPtr startptr;
XLogRecPtr endptr;
Size nbytes;
/* /*
* Figure out how much to send in one message. If there's less than * crossing a logid boundary, skip the non-existent last log
* MAX_SEND_SIZE bytes to send, send everything. Otherwise send * segment in previous logical log file.
* MAX_SEND_SIZE bytes, but round to page boundary.
*
* The rounding is not only for performance reasons. Walreceiver
* relies on the fact that we never split a WAL record across two
* messages. Since a long WAL record is split at page boundary into
* continuation records, page boundary is always a safe cut-off point.
* We also assume that SendRqstPtr never points in the middle of a WAL
* record.
*/ */
startptr = sentPtr; startptr.xlogid += 1;
if (startptr.xrecoff >= XLogFileSize) startptr.xrecoff = 0;
{ }
/*
* crossing a logid boundary, skip the non-existent last log
* segment in previous logical log file.
*/
startptr.xlogid += 1;
startptr.xrecoff = 0;
}
endptr = startptr; endptr = startptr;
XLByteAdvance(endptr, MAX_SEND_SIZE); XLByteAdvance(endptr, MAX_SEND_SIZE);
/* round down to page boundary. */ /* round down to page boundary. */
endptr.xrecoff -= (endptr.xrecoff % XLOG_BLCKSZ); endptr.xrecoff -= (endptr.xrecoff % XLOG_BLCKSZ);
/* if we went beyond SendRqstPtr, back off */ /* if we went beyond SendRqstPtr, back off */
if (XLByteLT(SendRqstPtr, endptr)) if (XLByteLT(SendRqstPtr, endptr))
endptr = SendRqstPtr; endptr = SendRqstPtr;
/* /*
* OK to read and send the slice. * OK to read and send the slice.
* *
* We don't need to convert the xlogid/xrecoff from host byte order to * We don't need to convert the xlogid/xrecoff from host byte order to
* network byte order because the both server can be expected to have * network byte order because the both server can be expected to have
* the same byte order. If they have different byte order, we don't * the same byte order. If they have different byte order, we don't
* reach here. * reach here.
*/ */
pq_sendbyte(outMsg, 'w'); pq_sendbyte(outMsg, 'w');
pq_sendbytes(outMsg, (char *) &startptr, sizeof(startptr)); pq_sendbytes(outMsg, (char *) &startptr, sizeof(startptr));
if (endptr.xlogid != startptr.xlogid) if (endptr.xlogid != startptr.xlogid)
{ {
Assert(endptr.xlogid == startptr.xlogid + 1); Assert(endptr.xlogid == startptr.xlogid + 1);
nbytes = endptr.xrecoff + XLogFileSize - startptr.xrecoff; nbytes = endptr.xrecoff + XLogFileSize - startptr.xrecoff;
} }
else else
nbytes = endptr.xrecoff - startptr.xrecoff; nbytes = endptr.xrecoff - startptr.xrecoff;
sentPtr = endptr; sentPtr = endptr;
/* /*
* Read the log directly into the output buffer to prevent extra * Read the log directly into the output buffer to prevent extra
* memcpy calls. * memcpy calls.
*/ */
enlargeStringInfo(outMsg, nbytes); enlargeStringInfo(outMsg, nbytes);
XLogRead(&outMsg->data[outMsg->len], startptr, nbytes); XLogRead(&outMsg->data[outMsg->len], startptr, nbytes);
outMsg->len += nbytes; outMsg->len += nbytes;
outMsg->data[outMsg->len] = '\0'; outMsg->data[outMsg->len] = '\0';
pq_putmessage('d', outMsg->data, outMsg->len); pq_putmessage('d', outMsg->data, outMsg->len);
resetStringInfo(outMsg); resetStringInfo(outMsg);
}
/* Update shared memory status */ /* Update shared memory status */
SpinLockAcquire(&walsnd->mutex); SpinLockAcquire(&walsnd->mutex);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment