Commit e652273e authored by Tom Lane's avatar Tom Lane

Redesign handling of SIGTERM/control-C in parallel pg_dump/pg_restore.

Formerly, Unix builds of pg_dump/pg_restore would trap SIGINT and similar
signals and set a flag that was tested in various data-transfer loops.
This was prone to errors of omission (cf commit 3c8aa665); and even if
the client-side response was prompt, we did nothing that would cause
long-running SQL commands (e.g. CREATE INDEX) to terminate early.
Also, the master process would effectively do nothing at all upon receipt
of SIGINT; the only reason it seemed to work was that in typical scenarios
the signal would also be delivered to the child processes.  We should
support termination when a signal is delivered only to the master process,
though.

Windows builds had no console interrupt handler, so they would just fall
over immediately at control-C, again leaving long-running SQL commands to
finish unmolested.

To fix, remove the flag-checking approach altogether.  Instead, allow the
Unix signal handler to send a cancel request directly and then exit(1).
In the master process, also have it forward the signal to the children.
On Windows, add a console interrupt handler that behaves approximately
the same.  The main difference is that a single execution of the Windows
handler can send all the cancel requests since all the info is available
in one process, whereas on Unix each process sends a cancel only for its
own database connection.

In passing, fix an old problem that DisconnectDatabase tends to send a
cancel request before exiting a parallel worker, even if nothing went
wrong.  This is at least a waste of cycles, and could lead to unexpected
log messages, or maybe even data loss if it happened in pg_restore (though
in the current code the problem seems to affect only pg_dump).  The cause
was that after a COPY step, pg_dump was leaving libpq in PGASYNC_BUSY
state, causing PQtransactionStatus() to report PQTRANS_ACTIVE.  That's
normally harmless because the next PQexec() will silently clear the
PGASYNC_BUSY state; but in a parallel worker we might exit without any
additional SQL commands after a COPY step.  So add an extra PQgetResult()
call after a COPY to allow libpq to return to PGASYNC_IDLE state.

This is a bug fix, IMO, so back-patch to 9.3 where parallel dump/restore
were introduced.

Thanks to Kyotaro Horiguchi for Windows testing and code suggestions.

Original-Patch: <7005.1464657274@sss.pgh.pa.us>
Discussion: <20160602.174941.256342236.horiguchi.kyotaro@lab.ntt.co.jp>
parent 7392eed7
......@@ -54,7 +54,6 @@
#include "postgres_fe.h"
#include "compress_io.h"
#include "parallel.h"
#include "pg_backup_utils.h"
/*----------------------
......@@ -184,9 +183,6 @@ void
WriteDataToArchive(ArchiveHandle *AH, CompressorState *cs,
const void *data, size_t dLen)
{
/* Are we aborting? */
checkAborting(AH);
switch (cs->comprAlg)
{
case COMPR_ALG_LIBZ:
......@@ -351,9 +347,6 @@ ReadDataFromArchiveZlib(ArchiveHandle *AH, ReadFunc readF)
/* no minimal chunk size for zlib */
while ((cnt = readF(AH, &buf, &buflen)))
{
/* Are we aborting? */
checkAborting(AH);
zp->next_in = (void *) buf;
zp->avail_in = cnt;
......@@ -414,9 +407,6 @@ ReadDataFromArchiveNone(ArchiveHandle *AH, ReadFunc readF)
while ((cnt = readF(AH, &buf, &buflen)))
{
/* Are we aborting? */
checkAborting(AH);
ahwrite(buf, 1, cnt, AH);
}
......
This diff is collapsed.
......@@ -82,6 +82,6 @@ extern void DispatchJobForTocEntry(ArchiveHandle *AH,
TocEntry *te, T_Action act);
extern void ParallelBackupEnd(ArchiveHandle *AH, ParallelState *pstate);
extern void checkAborting(ArchiveHandle *AH);
extern void set_archive_cancel_info(ArchiveHandle *AH, PGconn *conn);
#endif /* PG_DUMP_PARALLEL_H */
......@@ -4420,6 +4420,7 @@ CloneArchive(ArchiveHandle *AH)
/* The clone will have its own connection, so disregard connection state */
clone->connection = NULL;
clone->connCancel = NULL;
clone->currUser = NULL;
clone->currSchema = NULL;
clone->currTablespace = NULL;
......@@ -4493,6 +4494,9 @@ CloneArchive(ArchiveHandle *AH)
void
DeCloneArchive(ArchiveHandle *AH)
{
/* Should not have an open database connection */
Assert(AH->connection == NULL);
/* Clear format-specific state */
(AH->DeClonePtr) (AH);
......
......@@ -285,6 +285,9 @@ struct _archiveHandle
char *savedPassword; /* password for ropt->username, if known */
char *use_role;
PGconn *connection;
/* If connCancel isn't NULL, SIGINT handler will send a cancel */
PGcancel *volatile connCancel;
int connectToDB; /* Flag to indicate if direct DB connection is
* required */
ArchiverOutput outputKind; /* Flag for what we're currently writing */
......
......@@ -12,6 +12,7 @@
#include "postgres_fe.h"
#include "dumputils.h"
#include "parallel.h"
#include "pg_backup_archiver.h"
#include "pg_backup_db.h"
#include "pg_backup_utils.h"
......@@ -106,6 +107,9 @@ ReconnectToServer(ArchiveHandle *AH, const char *dbname, const char *username)
newConn = _connectDB(AH, newdbname, newusername);
/* Update ArchiveHandle's connCancel before closing old connection */
set_archive_cancel_info(AH, newConn);
PQfinish(AH->connection);
AH->connection = newConn;
......@@ -327,6 +331,9 @@ ConnectDatabase(Archive *AHX,
_check_database_version(AH);
PQsetNoticeProcessor(AH->connection, notice_processor, NULL);
/* arrange for SIGINT to issue a query cancel on this connection */
set_archive_cancel_info(AH, AH->connection);
}
/*
......@@ -337,19 +344,25 @@ void
DisconnectDatabase(Archive *AHX)
{
ArchiveHandle *AH = (ArchiveHandle *) AHX;
PGcancel *cancel;
char errbuf[1];
if (!AH->connection)
return;
if (PQtransactionStatus(AH->connection) == PQTRANS_ACTIVE)
if (AH->connCancel)
{
if ((cancel = PQgetCancel(AH->connection)))
{
PQcancel(cancel, errbuf, sizeof(errbuf));
PQfreeCancel(cancel);
}
/*
* If we have an active query, send a cancel before closing. This is
* of no use for a normal exit, but might be helpful during
* exit_horribly().
*/
if (PQtransactionStatus(AH->connection) == PQTRANS_ACTIVE)
PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
/*
* Prevent signal handler from sending a cancel after this.
*/
set_archive_cancel_info(AH, NULL);
}
PQfinish(AH->connection);
......@@ -631,6 +644,11 @@ EndDBCopyMode(Archive *AHX, const char *tocEntryTag)
tocEntryTag, PQerrorMessage(AH->connection));
PQclear(res);
/* Do this to ensure we've pumped libpq back to idle state */
if (PQgetResult(AH->connection) != NULL)
write_msg(NULL, "WARNING: unexpected extra results during COPY of table \"%s\"\n",
tocEntryTag);
AH->pgCopyIn = false;
}
}
......
......@@ -356,9 +356,6 @@ _WriteData(ArchiveHandle *AH, const void *data, size_t dLen)
{
lclContext *ctx = (lclContext *) AH->formatData;
/* Are we aborting? */
checkAborting(AH);
if (dLen > 0 && cfwrite(data, dLen, ctx->dataFH) != dLen)
WRITE_ERROR_EXIT;
......@@ -407,9 +404,6 @@ _PrintFileData(ArchiveHandle *AH, char *filename)
while ((cnt = cfread(buf, buflen, cfp)))
{
/* Are we aborting? */
checkAborting(AH);
ahwrite(buf, 1, cnt, AH);
}
......@@ -529,9 +523,6 @@ _WriteBuf(ArchiveHandle *AH, const void *buf, size_t len)
{
lclContext *ctx = (lclContext *) AH->formatData;
/* Are we aborting? */
checkAborting(AH);
if (cfwrite(buf, len, ctx->dataFH) != len)
WRITE_ERROR_EXIT;
......@@ -548,9 +539,6 @@ _ReadBuf(ArchiveHandle *AH, void *buf, size_t len)
{
lclContext *ctx = (lclContext *) AH->formatData;
/* Are we aborting? */
checkAborting(AH);
/*
* If there was an I/O error, we already exited in cfread(), so here we
* exit on short reads.
......
......@@ -1780,6 +1780,11 @@ dumpTableData_copy(Archive *fout, void *dcontext)
}
PQclear(res);
/* Do this to ensure we've pumped libpq back to idle state */
if (PQgetResult(conn) != NULL)
write_msg(NULL, "WARNING: unexpected extra results during COPY of table \"%s\"\n",
classname);
destroyPQExpBuffer(q);
return 1;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment