Commit ade2d61e authored by Tom Lane's avatar Tom Lane

Improve detection of child-process SIGPIPE failures.

Commit ffa4cbd6 added logic to detect SIGPIPE failure of a COPY child
process, but it only worked correctly if the SIGPIPE occurred in the
immediate child process.  Depending on the shell in use and the
complexity of the shell command string, we might instead get back
an exit code of 128 + SIGPIPE, representing a shell error exit
reporting SIGPIPE in the child process.

We could just hack up ClosePipeToProgram() to add the extra case,
but it seems like this is a fairly general issue deserving a more
general and better-documented solution.  I chose to add a couple
of functions in src/common/wait_error.c, which is a natural place
to know about wait-result encodings, that will test for either a
specific child-process signal type or any child-process signal failure.
Then, adjust other places that were doing ad-hoc tests of this type
to use the common functions.

In RestoreArchivedFile, this fixes a race condition affecting whether
the process will report an error or just silently proc_exit(1): before,
that depended on whether the intermediate shell got SIGTERM'd itself
or reported a child process failing on SIGTERM.

Like the previous patch, back-patch to v10; we could go further
but there seems no real need to.

Per report from Erik Rijkers.

Discussion: https://postgr.es/m/f3683f87ab1701bea5d86a7742b22432@xs4all.nl
parent 5e092800
...@@ -59,7 +59,6 @@ RestoreArchivedFile(char *path, const char *xlogfname, ...@@ -59,7 +59,6 @@ RestoreArchivedFile(char *path, const char *xlogfname,
char *endp; char *endp;
const char *sp; const char *sp;
int rc; int rc;
bool signaled;
struct stat stat_buf; struct stat stat_buf;
XLogSegNo restartSegNo; XLogSegNo restartSegNo;
XLogRecPtr restartRedoPtr; XLogRecPtr restartRedoPtr;
...@@ -289,17 +288,12 @@ RestoreArchivedFile(char *path, const char *xlogfname, ...@@ -289,17 +288,12 @@ RestoreArchivedFile(char *path, const char *xlogfname,
* will perform an immediate shutdown when it sees us exiting * will perform an immediate shutdown when it sees us exiting
* unexpectedly. * unexpectedly.
* *
* Per the Single Unix Spec, shells report exit status > 128 when a called * We treat hard shell errors such as "command not found" as fatal, too.
* command died on a signal. Also, 126 and 127 are used to report
* problems such as an unfindable command; treat those as fatal errors
* too.
*/ */
if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM) if (wait_result_is_signal(rc, SIGTERM))
proc_exit(1); proc_exit(1);
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; ereport(wait_result_is_any_signal(rc, true) ? FATAL : DEBUG2,
ereport(signaled ? FATAL : DEBUG2,
(errmsg("could not restore file \"%s\" from archive: %s", (errmsg("could not restore file \"%s\" from archive: %s",
xlogfname, wait_result_to_str(rc)))); xlogfname, wait_result_to_str(rc))));
...@@ -335,7 +329,6 @@ ExecuteRecoveryCommand(const char *command, const char *commandName, bool failOn ...@@ -335,7 +329,6 @@ ExecuteRecoveryCommand(const char *command, const char *commandName, bool failOn
char *endp; char *endp;
const char *sp; const char *sp;
int rc; int rc;
bool signaled;
XLogSegNo restartSegNo; XLogSegNo restartSegNo;
XLogRecPtr restartRedoPtr; XLogRecPtr restartRedoPtr;
TimeLineID restartTli; TimeLineID restartTli;
...@@ -403,12 +396,9 @@ ExecuteRecoveryCommand(const char *command, const char *commandName, bool failOn ...@@ -403,12 +396,9 @@ ExecuteRecoveryCommand(const char *command, const char *commandName, bool failOn
{ {
/* /*
* If the failure was due to any sort of signal, it's best to punt and * If the failure was due to any sort of signal, it's best to punt and
* abort recovery. See also detailed comments on signals in * abort recovery. See comments in RestoreArchivedFile().
* RestoreArchivedFile().
*/ */
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; ereport((failOnSignal && wait_result_is_any_signal(rc, true)) ? FATAL : WARNING,
ereport((signaled && failOnSignal) ? FATAL : WARNING,
/*------ /*------
translator: First %s represents a postgresql.conf parameter name like translator: First %s represents a postgresql.conf parameter name like
"recovery_end_command", the 2nd is the value of that parameter, the "recovery_end_command", the 2nd is the value of that parameter, the
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include <ctype.h> #include <ctype.h>
#include <unistd.h> #include <unistd.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/wait.h>
#include "access/heapam.h" #include "access/heapam.h"
#include "access/htup_details.h" #include "access/htup_details.h"
...@@ -1718,7 +1717,7 @@ ClosePipeToProgram(CopyState cstate) ...@@ -1718,7 +1717,7 @@ ClosePipeToProgram(CopyState cstate)
* problem. * problem.
*/ */
if (cstate->is_copy_from && !cstate->reached_eof && if (cstate->is_copy_from && !cstate->reached_eof &&
WIFSIGNALED(pclose_rc) && WTERMSIG(pclose_rc) == SIGPIPE) wait_result_is_signal(pclose_rc, SIGPIPE))
return; return;
ereport(ERROR, ereport(ERROR,
......
...@@ -627,13 +627,11 @@ pgarch_archiveXlog(char *xlog) ...@@ -627,13 +627,11 @@ pgarch_archiveXlog(char *xlog)
* If either the shell itself, or a called command, died on a signal, * If either the shell itself, or a called command, died on a signal,
* abort the archiver. We do this because system() ignores SIGINT and * abort the archiver. We do this because system() ignores SIGINT and
* SIGQUIT while waiting; so a signal is very likely something that * SIGQUIT while waiting; so a signal is very likely something that
* should have interrupted us too. If we overreact it's no big deal, * should have interrupted us too. Also die if the shell got a hard
* the postmaster will just start the archiver again. * "command not found" type of error. If we overreact it's no big
* * deal, the postmaster will just start the archiver again.
* Per the Single Unix Spec, shells report exit status > 128 when a
* called command died on a signal.
*/ */
int lev = (WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128) ? FATAL : LOG; int lev = wait_result_is_any_signal(rc, true) ? FATAL : LOG;
if (WIFEXITED(rc)) if (WIFEXITED(rc))
{ {
......
...@@ -82,3 +82,46 @@ wait_result_to_str(int exitstatus) ...@@ -82,3 +82,46 @@ wait_result_to_str(int exitstatus)
return pstrdup(str); return pstrdup(str);
} }
/*
* Return true if a wait(2) result indicates that the child process
* died due to the specified signal.
*
* The reason this is worth having a wrapper function for is that
* there are two cases: the signal might have been received by our
* immediate child process, or there might've been a shell process
* between us and the child that died. The shell will, per POSIX,
* report the child death using exit code 128 + signal number.
*
* If there is no possibility of an intermediate shell, this function
* need not (and probably should not) be used.
*/
bool
wait_result_is_signal(int exit_status, int signum)
{
if (WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum)
return true;
if (WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == 128 + signum)
return true;
return false;
}
/*
* Return true if a wait(2) result indicates that the child process
* died due to any signal. We consider either direct child death
* or a shell report of child process death as matching the condition.
*
* If include_command_not_found is true, also return true for shell
* exit codes indicating "command not found" and the like
* (specifically, exit codes 126 and 127; see above).
*/
bool
wait_result_is_any_signal(int exit_status, bool include_command_not_found)
{
if (WIFSIGNALED(exit_status))
return true;
if (WIFEXITED(exit_status) &&
WEXITSTATUS(exit_status) > (include_command_not_found ? 125 : 128))
return true;
return false;
}
...@@ -519,5 +519,7 @@ extern char *escape_single_quotes_ascii(const char *src); ...@@ -519,5 +519,7 @@ extern char *escape_single_quotes_ascii(const char *src);
/* port/wait_error.c */ /* port/wait_error.c */
extern char *wait_result_to_str(int exit_status); extern char *wait_result_to_str(int exit_status);
extern bool wait_result_is_signal(int exit_status, int signum);
extern bool wait_result_is_any_signal(int exit_status, bool include_command_not_found);
#endif /* PG_PORT_H */ #endif /* PG_PORT_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment