Commit 754baa21 authored by Heikki Linnakangas

Automatically terminate replication connections that are idle for more
than replication_timeout (a new GUC) milliseconds. The TCP timeout is often
too long; you want the master to notice a dead connection much sooner.
People complained about that in 9.0 too, but with synchronous replication
it's even more important to notice dead connections promptly.

Fujii Masao and Heikki Linnakangas
parent bc03c593
......@@ -2019,6 +2019,29 @@ SET ENABLE_SEQSCAN TO OFF;
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-replication-timeout" xreflabel="replication_timeout">
<term><varname>replication_timeout</varname> (<type>integer</type>)</term>
<indexterm>
<primary><varname>replication_timeout</> configuration parameter</primary>
</indexterm>
<listitem>
<para>
Terminate replication connections that are inactive longer
than the specified number of milliseconds. This is useful for
the primary server to detect a standby crash or network outage.
A value of zero disables the timeout mechanism. This parameter can only be set in
the <filename>postgresql.conf</> file or on the server command line.
The default value is 60 seconds.
</para>
<para>
To prevent connections from being terminated prematurely,
<xref linkend="guc-wal-receiver-status-interval">
must be enabled on the standby, and its value must be less than the
value of <varname>replication_timeout</>.
</para>
</listitem>
</varlistentry>
</variablelist>
</sect2>
......@@ -2216,6 +2239,11 @@ SET ENABLE_SEQSCAN TO OFF;
the <filename>postgresql.conf</> file or on the server command line.
The default value is 10 seconds.
</para>
<para>
When <xref linkend="guc-replication-timeout"> is enabled on the primary,
<varname>wal_receiver_status_interval</> must be enabled, and its value
must be less than the value of <varname>replication_timeout</>.
</para>
</listitem>
</varlistentry>
......
This diff is collapsed.
......@@ -193,19 +193,21 @@ DisownLatch(volatile Latch *latch)
bool
WaitLatch(volatile Latch *latch, long timeout)
{
return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
}
/*
* Like WaitLatch, but will also return when there's data available in
* 'sock' for reading. Returns 0 if timeout was reached, 1 if the latch
* was set, or 2 if the scoket became readable.
* 'sock' for reading or writing. Returns 0 if timeout was reached,
* 1 if the latch was set, 2 if the socket became readable or writable.
*/
int
WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, bool forRead,
bool forWrite, long timeout)
{
struct timeval tv, *tvp = NULL;
fd_set input_mask;
fd_set output_mask;
int rc;
int result = 0;
......@@ -241,14 +243,22 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
FD_ZERO(&input_mask);
FD_SET(selfpipe_readfd, &input_mask);
hifd = selfpipe_readfd;
if (sock != PGINVALID_SOCKET)
if (sock != PGINVALID_SOCKET && forRead)
{
FD_SET(sock, &input_mask);
if (sock > hifd)
hifd = sock;
}
rc = select(hifd + 1, &input_mask, NULL, NULL, tvp);
FD_ZERO(&output_mask);
if (sock != PGINVALID_SOCKET && forWrite)
{
FD_SET(sock, &output_mask);
if (sock > hifd)
hifd = sock;
}
rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);
if (rc < 0)
{
if (errno == EINTR)
......@@ -263,7 +273,9 @@ WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
result = 0;
break;
}
if (sock != PGINVALID_SOCKET && FD_ISSET(sock, &input_mask))
if (sock != PGINVALID_SOCKET &&
((forRead && FD_ISSET(sock, &input_mask)) ||
(forWrite && FD_ISSET(sock, &output_mask))))
{
result = 2;
break; /* data available in socket */
......
......@@ -14,7 +14,8 @@
#include "postgres.h"
/*
* Indicate if pgwin32_recv() should operate in non-blocking mode.
* Indicate if pgwin32_recv() and pgwin32_send() should operate
* in non-blocking mode.
*
* Since the socket emulation layer always sets the actual socket to
* non-blocking mode in order to be able to deliver signals, we must
......@@ -399,6 +400,16 @@ pgwin32_send(SOCKET s, char *buf, int len, int flags)
return -1;
}
if (pgwin32_noblock)
{
/*
* No data sent, and we are in "emulated non-blocking mode", so
* return indicating that we'd block if we were to continue.
*/
errno = EWOULDBLOCK;
return -1;
}
/* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
......
......@@ -85,11 +85,12 @@ DisownLatch(volatile Latch *latch)
bool
WaitLatch(volatile Latch *latch, long timeout)
{
return WaitLatchOrSocket(latch, PGINVALID_SOCKET, timeout) > 0;
return WaitLatchOrSocket(latch, PGINVALID_SOCKET, false, false, timeout) > 0;
}
int
WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, bool forRead,
bool forWrite, long timeout)
{
DWORD rc;
HANDLE events[3];
......@@ -103,10 +104,17 @@ WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
events[0] = latchevent;
events[1] = pgwin32_signal_event;
numevents = 2;
if (sock != PGINVALID_SOCKET)
if (sock != PGINVALID_SOCKET && (forRead || forWrite))
{
int flags = 0;
if (forRead)
flags |= FD_READ;
if (forWrite)
flags |= FD_WRITE;
sockevent = WSACreateEvent();
WSAEventSelect(sock, sockevent, FD_READ);
WSAEventSelect(sock, sockevent, flags);
events[numevents++] = sockevent;
}
......@@ -139,8 +147,18 @@ WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
pgwin32_dispatch_queued_signals();
else if (rc == WAIT_OBJECT_0 + 2)
{
WSANETWORKEVENTS resEvents;
Assert(sock != PGINVALID_SOCKET);
result = 2;
ZeroMemory(&resEvents, sizeof(resEvents));
if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) == SOCKET_ERROR)
ereport(FATAL,
(errmsg_internal("failed to enumerate network events: %i", (int) GetLastError())));
if ((forRead && resEvents.lNetworkEvents & FD_READ) ||
(forWrite && resEvents.lNetworkEvents & FD_WRITE))
result = 2;
break;
}
else if (rc != WAIT_OBJECT_0)
......@@ -148,7 +166,7 @@ WaitLatchOrSocket(volatile Latch *latch, SOCKET sock, long timeout)
}
/* Clean up the handle we created for the socket */
if (sock != PGINVALID_SOCKET)
if (sock != PGINVALID_SOCKET && (forRead || forWrite))
{
WSAEventSelect(sock, sockevent, 0);
WSACloseEvent(sockevent);
......
......@@ -74,6 +74,7 @@ bool am_walsender = false; /* Am I a walsender process ? */
/* User-settable parameters for walsender */
int max_wal_senders = 0; /* the maximum number of concurrent walsenders */
int WalSndDelay = 1000; /* max sleep time between some actions */
int replication_timeout = 60 * 1000; /* maximum time to send one WAL data message */
/*
* These variables are used similarly to openLogFile/Id/Seg/Off,
......@@ -95,6 +96,11 @@ static XLogRecPtr sentPtr = {0, 0};
*/
static StringInfoData reply_message;
/*
* Timestamp of the last receipt of the reply from the standby.
*/
static TimestampTz last_reply_timestamp;
/* Flags set by signal handlers for later service in main loop */
static volatile sig_atomic_t got_SIGHUP = false;
volatile sig_atomic_t walsender_shutdown_requested = false;
......@@ -113,7 +119,7 @@ static int WalSndLoop(void);
static void InitWalSnd(void);
static void WalSndHandshake(void);
static void WalSndKill(int code, Datum arg);
static bool XLogSend(char *msgbuf, bool *caughtup);
static void XLogSend(char *msgbuf, bool *caughtup);
static void IdentifySystem(void);
static void StartReplication(StartReplicationCmd * cmd);
static void ProcessStandbyMessage(void);
......@@ -469,6 +475,7 @@ ProcessRepliesIfAny(void)
{
unsigned char firstchar;
int r;
int received = false;
for (;;)
{
......@@ -484,7 +491,7 @@ ProcessRepliesIfAny(void)
if (r == 0)
{
/* no data available without blocking */
return;
break;
}
/* Handle the very limited subset of commands expected in this phase */
......@@ -495,6 +502,7 @@ ProcessRepliesIfAny(void)
*/
case 'd':
ProcessStandbyMessage();
received = true;
break;
/*
......@@ -510,6 +518,12 @@ ProcessRepliesIfAny(void)
firstchar)));
}
}
/*
* Save the last reply timestamp if we've received at least
* one reply.
*/
if (received)
last_reply_timestamp = GetCurrentTimestamp();
}
/*
......@@ -688,6 +702,9 @@ WalSndLoop(void)
*/
initStringInfo(&reply_message);
/* Initialize the last reply timestamp */
last_reply_timestamp = GetCurrentTimestamp();
/* Loop forever, unless we get an error */
for (;;)
{
......@@ -706,19 +723,6 @@ WalSndLoop(void)
SyncRepInitConfig();
}
/*
* When SIGUSR2 arrives, we send all outstanding logs up to the
* shutdown checkpoint record (i.e., the latest record) and exit.
*/
if (walsender_ready_to_stop)
{
if (!XLogSend(output_message, &caughtup))
break;
ProcessRepliesIfAny();
if (caughtup)
walsender_shutdown_requested = true;
}
/* Normal exit from the walsender is here */
if (walsender_shutdown_requested)
{
......@@ -730,11 +734,13 @@ WalSndLoop(void)
}
/*
* If we had sent all accumulated WAL in last round, nap for the
* configured time before retrying.
* If we don't have any pending data in the output buffer, try to
* send some more.
*/
if (caughtup)
if (!pq_is_send_pending())
{
XLogSend(output_message, &caughtup);
/*
* Even if we wrote all the WAL that was available when we started
* sending, more might have arrived while we were sending this
......@@ -742,28 +748,79 @@ WalSndLoop(void)
* received any signals from that time. Let's arm the latch
* again, and after that check that we're still up-to-date.
*/
ResetLatch(&MyWalSnd->latch);
if (!XLogSend(output_message, &caughtup))
break;
if (caughtup && !got_SIGHUP && !walsender_ready_to_stop && !walsender_shutdown_requested)
if (caughtup && !pq_is_send_pending())
{
/*
* XXX: We don't really need the periodic wakeups anymore,
* WaitLatchOrSocket should reliably wake up as soon as
* something interesting happens.
*/
ResetLatch(&MyWalSnd->latch);
/* Sleep */
WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
WalSndDelay * 1000L);
XLogSend(output_message, &caughtup);
}
}
else
/* Flush pending output to the client */
if (pq_flush_if_writable() != 0)
break;
/*
* When SIGUSR2 arrives, we send any outstanding logs up to the
* shutdown checkpoint record (i.e., the latest record) and exit.
*/
if (walsender_ready_to_stop && !pq_is_send_pending())
{
/* Attempt to send the log once every loop */
if (!XLogSend(output_message, &caughtup))
XLogSend(output_message, &caughtup);
ProcessRepliesIfAny();
if (caughtup && !pq_is_send_pending())
walsender_shutdown_requested = true;
}
if ((caughtup || pq_is_send_pending()) &&
!got_SIGHUP &&
!walsender_shutdown_requested)
{
TimestampTz finish_time;
long sleeptime;
/* Reschedule replication timeout */
if (replication_timeout > 0)
{
long secs;
int usecs;
finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
replication_timeout);
TimestampDifference(GetCurrentTimestamp(),
finish_time, &secs, &usecs);
sleeptime = secs * 1000 + usecs / 1000;
if (WalSndDelay < sleeptime)
sleeptime = WalSndDelay;
}
else
{
/*
* XXX: Without timeout, we don't really need the periodic
* wakeups anymore, WaitLatchOrSocket should reliably wake up
* as soon as something interesting happens.
*/
sleeptime = WalSndDelay;
}
/* Sleep */
WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
true, pq_is_send_pending(),
sleeptime * 1000L);
/* Check for replication timeout */
if (replication_timeout > 0 &&
GetCurrentTimestamp() >= finish_time)
{
/*
* Since typically expiration of replication timeout means
* communication problem, we don't send the error message
* to the standby.
*/
ereport(COMMERROR,
(errmsg("terminating walsender process due to replication timeout")));
break;
}
}
/*
......@@ -993,7 +1050,8 @@ XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
/*
* Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
* but not yet sent to the client, and send it.
* but not yet sent to the client, and buffer it in the libpq output
* buffer.
*
* msgbuf is a work area in which the output message is constructed. It's
* passed in just so we can avoid re-palloc'ing the buffer on each cycle.
......@@ -1001,10 +1059,9 @@ XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
*
* If there is no unsent WAL remaining, *caughtup is set to true, otherwise
* *caughtup is set to false.
*
* Returns true if OK, false if trouble.
*/
static bool
static void
XLogSend(char *msgbuf, bool *caughtup)
{
XLogRecPtr SendRqstPtr;
......@@ -1027,7 +1084,7 @@ XLogSend(char *msgbuf, bool *caughtup)
if (XLByteLE(SendRqstPtr, sentPtr))
{
*caughtup = true;
return true;
return;
}
/*
......@@ -1099,11 +1156,7 @@ XLogSend(char *msgbuf, bool *caughtup)
memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));
pq_putmessage('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
/* Flush pending output to the client */
if (pq_flush())
return false;
pq_putmessage_noblock('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);
sentPtr = endptr;
......@@ -1127,7 +1180,7 @@ XLogSend(char *msgbuf, bool *caughtup)
set_ps_display(activitymsg, false);
}
return true;
return;
}
/* SIGHUP: set flag to re-read config file at next convenient time */
......
......@@ -1855,6 +1855,16 @@ static struct config_int ConfigureNamesInt[] =
1000, 1, 10000, NULL, NULL
},
{
{"replication_timeout", PGC_SIGHUP, WAL_REPLICATION,
gettext_noop("Sets the maximum time to wait for WAL replication."),
NULL,
GUC_UNIT_MS
},
&replication_timeout,
60 * 1000, 0, INT_MAX, NULL, NULL
},
{
{"commit_delay", PGC_USERSET, WAL_SETTINGS,
gettext_noop("Sets the delay in microseconds between transaction commit and "
......
......@@ -200,6 +200,7 @@
#wal_sender_delay = 1s # walsender cycle time, 1-10000 milliseconds
#wal_keep_segments = 0 # in logfile segments, 16MB each; 0 disables
#vacuum_defer_cleanup_age = 0 # number of xacts by which cleanup is delayed
#replication_timeout = 60s # in milliseconds, 0 is disabled
# - Standby Servers -
......
......@@ -60,7 +60,10 @@ extern int pq_peekbyte(void);
extern int pq_getbyte_if_available(unsigned char *c);
extern int pq_putbytes(const char *s, size_t len);
extern int pq_flush(void);
extern int pq_flush_if_writable(void);
extern bool pq_is_send_pending(void);
extern int pq_putmessage(char msgtype, const char *s, size_t len);
extern void pq_putmessage_noblock(char msgtype, const char *s, size_t len);
extern void pq_startcopyout(void);
extern void pq_endcopyout(bool errorAbort);
......
......@@ -98,6 +98,7 @@ extern volatile sig_atomic_t walsender_ready_to_stop;
/* user-settable parameters */
extern int WalSndDelay;
extern int max_wal_senders;
extern int replication_timeout;
extern int WalSenderMain(void);
extern void WalSndSignals(void);
......
......@@ -40,7 +40,7 @@ extern void OwnLatch(volatile Latch *latch);
extern void DisownLatch(volatile Latch *latch);
extern bool WaitLatch(volatile Latch *latch, long timeout);
extern int WaitLatchOrSocket(volatile Latch *latch, pgsocket sock,
long timeout);
bool forRead, bool forWrite, long timeout);
extern void SetLatch(volatile Latch *latch);
extern void ResetLatch(volatile Latch *latch);
#define TestLatch(latch) (((volatile Latch *) latch)->is_set)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment