Commit 6bcce258 authored by Tom Lane's avatar Tom Lane

Fix "pg_ctl start -w" to test child process status directly.

pg_ctl start with -w previously relied on a heuristic that the postmaster
would surely always manage to create postmaster.pid within five seconds.
Unfortunately, that fails much more often than we would like on some of the
slower, more heavily loaded buildfarm members.

We have known for quite some time that we could remove the need for that
heuristic on Unix by using fork/exec instead of system() to launch the
postmaster.  This allows us to know the exact PID of the postmaster, which
allows near-certain verification that the postmaster.pid file is the one
we want and not a leftover, and it also lets us use waitpid() to detect
reliably whether the child postmaster has exited or not.

What was blocking this change was not wanting to rewrite the Windows
version of start_postmaster() to avoid use of CMD.EXE.  That's doable
in theory but would require fooling about with stdout/stderr redirection,
and getting the handling of quote-containing postmaster switches to
stay the same might be rather ticklish.  However, we realized that
we don't have to do that to fix the problem, because we can test
whether the shell process has exited as a proxy for whether the
postmaster is still alive.  That doesn't allow an exact check of the
PID in postmaster.pid, but we're no worse off than before in that
respect; and we do get to get rid of the heuristic about how long the
postmaster might take to create postmaster.pid.

On Unix, this change means that a second "pg_ctl start -w" immediately
after another such command will now reliably fail, whereas previously
it would succeed if done within two seconds of the earlier command.
Since that's a saner behavior anyway, it's fine.  On Windows, the case can
still succeed within the same time window, since pg_ctl can't tell that the
earlier postmaster's postmaster.pid isn't the pidfile it is looking for.
To ensure stable test results on Windows, we can insert a short sleep into
the test script for pg_ctl, ensuring that the existing pidfile looks stale.
This hack can be removed if we ever do rewrite start_postmaster(), but that
no longer seems like a high-priority thing to do.

Back-patch to all supported versions, both because the current behavior
is buggy and because we must do that if we want the buildfarm failures
to go away.

Tom Lane and Michael Paquier
parent 7732d49c
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <time.h> #include <time.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/wait.h>
#include <unistd.h> #include <unistd.h>
#ifdef HAVE_SYS_RESOURCE_H #ifdef HAVE_SYS_RESOURCE_H
...@@ -153,10 +154,10 @@ static int CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, ...@@ -153,10 +154,10 @@ static int CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo,
static pgpid_t get_pgpid(bool is_status_request); static pgpid_t get_pgpid(bool is_status_request);
static char **readfile(const char *path); static char **readfile(const char *path);
static void free_readfile(char **optlines); static void free_readfile(char **optlines);
static int start_postmaster(void); static pgpid_t start_postmaster(void);
static void read_post_opts(void); static void read_post_opts(void);
static PGPing test_postmaster_connection(bool); static PGPing test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint);
static bool postmaster_is_alive(pid_t pid); static bool postmaster_is_alive(pid_t pid);
#if defined(HAVE_GETRLIMIT) && defined(RLIMIT_CORE) #if defined(HAVE_GETRLIMIT) && defined(RLIMIT_CORE)
...@@ -419,36 +420,73 @@ free_readfile(char **optlines) ...@@ -419,36 +420,73 @@ free_readfile(char **optlines)
* start/test/stop routines * start/test/stop routines
*/ */
static int /*
* Start the postmaster and return its PID.
*
* Currently, on Windows what we return is the PID of the shell process
* that launched the postmaster (and, we trust, is waiting for it to exit).
* So the PID is usable for "is the postmaster still running" checks,
* but cannot be compared directly to postmaster.pid.
*
* On Windows, we also save aside a handle to the shell process in
* "postmasterProcess", which the caller should close when done with it.
*/
static pgpid_t
start_postmaster(void) start_postmaster(void)
{ {
char cmd[MAXPGPATH]; char cmd[MAXPGPATH];
#ifndef WIN32 #ifndef WIN32
pgpid_t pm_pid;
/* Flush stdio channels just before fork, to avoid double-output problems */
fflush(stdout);
fflush(stderr);
pm_pid = fork();
if (pm_pid < 0)
{
/* fork failed */
write_stderr(_("%s: could not start server: %s\n"),
progname, strerror(errno));
exit(1);
}
if (pm_pid > 0)
{
/* fork succeeded, in parent */
return pm_pid;
}
/* fork succeeded, in child */
/* /*
* Since there might be quotes to handle here, it is easier simply to pass * Since there might be quotes to handle here, it is easier simply to pass
* everything to a shell to process them. * everything to a shell to process them. Use exec so that the postmaster
* * has the same PID as the current child process.
* XXX it would be better to fork and exec so that we would know the child
* postmaster's PID directly; then test_postmaster_connection could use
* the PID without having to rely on reading it back from the pidfile.
*/ */
if (log_file != NULL) if (log_file != NULL)
snprintf(cmd, MAXPGPATH, "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &", snprintf(cmd, MAXPGPATH, "exec \"%s\" %s%s < \"%s\" >> \"%s\" 2>&1",
exec_path, pgdata_opt, post_opts, exec_path, pgdata_opt, post_opts,
DEVNULL, log_file); DEVNULL, log_file);
else else
snprintf(cmd, MAXPGPATH, "\"%s\" %s%s < \"%s\" 2>&1 &", snprintf(cmd, MAXPGPATH, "exec \"%s\" %s%s < \"%s\" 2>&1",
exec_path, pgdata_opt, post_opts, DEVNULL); exec_path, pgdata_opt, post_opts, DEVNULL);
return system(cmd); (void) execl("/bin/sh", "/bin/sh", "-c", cmd, (char *) NULL);
/* exec failed */
write_stderr(_("%s: could not start server: %s\n"),
progname, strerror(errno));
exit(1);
return 0; /* keep dumb compilers quiet */
#else /* WIN32 */ #else /* WIN32 */
/* /*
* On win32 we don't use system(). So we don't need to use & (which would * As with the Unix case, it's easiest to use the shell (CMD.EXE) to
* be START /B on win32). However, we still call the shell (CMD.EXE) with * handle redirection etc. Unfortunately CMD.EXE lacks any equivalent of
* it to handle redirection etc. * "exec", so we don't get to find out the postmaster's PID immediately.
*/ */
PROCESS_INFORMATION pi; PROCESS_INFORMATION pi;
...@@ -460,10 +498,15 @@ start_postmaster(void) ...@@ -460,10 +498,15 @@ start_postmaster(void)
exec_path, pgdata_opt, post_opts, DEVNULL); exec_path, pgdata_opt, post_opts, DEVNULL);
if (!CreateRestrictedProcess(cmd, &pi, false)) if (!CreateRestrictedProcess(cmd, &pi, false))
return GetLastError(); {
CloseHandle(pi.hProcess); write_stderr(_("%s: could not start server: error code %lu\n"),
progname, (unsigned long) GetLastError());
exit(1);
}
/* Don't close command process handle here; caller must do so */
postmasterProcess = pi.hProcess;
CloseHandle(pi.hThread); CloseHandle(pi.hThread);
return 0; return pi.dwProcessId; /* Shell's PID, not postmaster's! */
#endif /* WIN32 */ #endif /* WIN32 */
} }
...@@ -472,15 +515,21 @@ start_postmaster(void) ...@@ -472,15 +515,21 @@ start_postmaster(void)
/* /*
* Find the pgport and try a connection * Find the pgport and try a connection
* *
* On Unix, pm_pid is the PID of the just-launched postmaster. On Windows,
* it may be the PID of an ancestor shell process, so we can't check the
* contents of postmaster.pid quite as carefully.
*
* On Windows, the static variable postmasterProcess is an implicit argument
* to this routine; it contains a handle to the postmaster process or an
* ancestor shell process thereof.
*
* Note that the checkpoint parameter enables a Windows service control * Note that the checkpoint parameter enables a Windows service control
* manager checkpoint, it's got nothing to do with database checkpoints!! * manager checkpoint, it's got nothing to do with database checkpoints!!
*/ */
static PGPing static PGPing
test_postmaster_connection(bool do_checkpoint) test_postmaster_connection(pgpid_t pm_pid, bool do_checkpoint)
{ {
PGPing ret = PQPING_NO_RESPONSE; PGPing ret = PQPING_NO_RESPONSE;
bool found_stale_pidfile = false;
pgpid_t pm_pid = 0;
char connstr[MAXPGPATH * 2 + 256]; char connstr[MAXPGPATH * 2 + 256];
int i; int i;
...@@ -535,29 +584,27 @@ test_postmaster_connection(bool do_checkpoint) ...@@ -535,29 +584,27 @@ test_postmaster_connection(bool do_checkpoint)
optlines[5] != NULL) optlines[5] != NULL)
{ {
/* File is complete enough for us, parse it */ /* File is complete enough for us, parse it */
long pmpid; pgpid_t pmpid;
time_t pmstart; time_t pmstart;
/* /*
* Make sanity checks. If it's for a standalone backend * Make sanity checks. If it's for the wrong PID, or the
* (negative PID), or the recorded start time is before * recorded start time is before pg_ctl started, then
* pg_ctl started, then either we are looking at the wrong * either we are looking at the wrong data directory, or
* data directory, or this is a pre-existing pidfile that * this is a pre-existing pidfile that hasn't (yet?) been
* hasn't (yet?) been overwritten by our child postmaster. * overwritten by our child postmaster. Allow 2 seconds
* Allow 2 seconds slop for possible cross-process clock * slop for possible cross-process clock skew.
* skew.
*/ */
pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]); pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]); pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
if (pmpid <= 0 || pmstart < start_time - 2) if (pmstart >= start_time - 2 &&
{ #ifndef WIN32
/* pmpid == pm_pid
* Set flag to report stale pidfile if it doesn't get #else
* overwritten before we give up waiting. /* Windows can only reject standalone-backend PIDs */
*/ pmpid > 0
found_stale_pidfile = true; #endif
} )
else
{ {
/* /*
* OK, seems to be a valid pidfile from our child. * OK, seems to be a valid pidfile from our child.
...@@ -567,9 +614,6 @@ test_postmaster_connection(bool do_checkpoint) ...@@ -567,9 +614,6 @@ test_postmaster_connection(bool do_checkpoint)
char *hostaddr; char *hostaddr;
char host_str[MAXPGPATH]; char host_str[MAXPGPATH];
found_stale_pidfile = false;
pm_pid = (pgpid_t) pmpid;
/* /*
* Extract port number and host string to use. Prefer * Extract port number and host string to use. Prefer
* using Unix socket if available. * using Unix socket if available.
...@@ -635,42 +679,23 @@ test_postmaster_connection(bool do_checkpoint) ...@@ -635,42 +679,23 @@ test_postmaster_connection(bool do_checkpoint)
} }
/* /*
* The postmaster should create postmaster.pid very soon after being * Check whether the child postmaster process is still alive. This
* started. If it's not there after we've waited 5 or more seconds, * lets us exit early if the postmaster fails during startup.
* assume startup failed and give up waiting. (Note this covers both *
* cases where the pidfile was never created, and where it was created * On Windows, we may be checking the postmaster's parent shell, but
* and then removed during postmaster exit.) Also, if there *is* a * that's fine for this purpose.
* file there but it appears stale, issue a suitable warning and give
* up waiting.
*/ */
if (i >= 5) #ifndef WIN32
{ {
struct stat statbuf; int exitstatus;
if (stat(pid_file, &statbuf) != 0) if (waitpid((pid_t) pm_pid, &exitstatus, WNOHANG) == (pid_t) pm_pid)
{
if (errno != ENOENT)
write_stderr(_("\n%s: could not stat file \"%s\": %s\n"),
progname, pid_file, strerror(errno));
return PQPING_NO_RESPONSE;
}
if (found_stale_pidfile)
{
write_stderr(_("\n%s: this data directory appears to be running a pre-existing postmaster\n"),
progname);
return PQPING_NO_RESPONSE; return PQPING_NO_RESPONSE;
}
} }
#else
/* if (WaitForSingleObject(postmasterProcess, 0) == WAIT_OBJECT_0)
* If we've been able to identify the child postmaster's PID, check
* the process is still alive. This covers cases where the postmaster
* successfully created the pidfile but then crashed without removing
* it.
*/
if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid))
return PQPING_NO_RESPONSE; return PQPING_NO_RESPONSE;
#endif
/* No response, or startup still in process; wait */ /* No response, or startup still in process; wait */
#if defined(WIN32) #if defined(WIN32)
...@@ -836,7 +861,7 @@ static void ...@@ -836,7 +861,7 @@ static void
do_start(void) do_start(void)
{ {
pgpid_t old_pid = 0; pgpid_t old_pid = 0;
int exitcode; pgpid_t pm_pid;
if (ctl_command != RESTART_COMMAND) if (ctl_command != RESTART_COMMAND)
{ {
...@@ -876,19 +901,13 @@ do_start(void) ...@@ -876,19 +901,13 @@ do_start(void)
} }
#endif #endif
exitcode = start_postmaster(); pm_pid = start_postmaster();
if (exitcode != 0)
{
write_stderr(_("%s: could not start server: exit code was %d\n"),
progname, exitcode);
exit(1);
}
if (do_wait) if (do_wait)
{ {
print_msg(_("waiting for server to start...")); print_msg(_("waiting for server to start..."));
switch (test_postmaster_connection(false)) switch (test_postmaster_connection(pm_pid, false))
{ {
case PQPING_OK: case PQPING_OK:
print_msg(_(" done\n")); print_msg(_(" done\n"));
...@@ -914,6 +933,12 @@ do_start(void) ...@@ -914,6 +933,12 @@ do_start(void)
} }
else else
print_msg(_("server starting\n")); print_msg(_("server starting\n"));
#ifdef WIN32
/* Now we don't need the handle to the shell process anymore */
CloseHandle(postmasterProcess);
postmasterProcess = INVALID_HANDLE_VALUE;
#endif
} }
...@@ -1585,7 +1610,7 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv) ...@@ -1585,7 +1610,7 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
if (do_wait) if (do_wait)
{ {
write_eventlog(EVENTLOG_INFORMATION_TYPE, _("Waiting for server startup...\n")); write_eventlog(EVENTLOG_INFORMATION_TYPE, _("Waiting for server startup...\n"));
if (test_postmaster_connection(true) != PQPING_OK) if (test_postmaster_connection(postmasterPID, true) != PQPING_OK)
{ {
write_eventlog(EVENTLOG_ERROR_TYPE, _("Timed out waiting for server startup\n")); write_eventlog(EVENTLOG_ERROR_TYPE, _("Timed out waiting for server startup\n"));
pgwin32_SetServiceStatus(SERVICE_STOPPED); pgwin32_SetServiceStatus(SERVICE_STOPPED);
...@@ -1606,10 +1631,9 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv) ...@@ -1606,10 +1631,9 @@ pgwin32_ServiceMain(DWORD argc, LPTSTR *argv)
{ {
/* /*
* status.dwCheckPoint can be incremented by * status.dwCheckPoint can be incremented by
* test_postmaster_connection(true), so it might not start * test_postmaster_connection(), so it might not start from 0.
* from 0.
*/ */
int maxShutdownCheckPoint = status.dwCheckPoint + 12;; int maxShutdownCheckPoint = status.dwCheckPoint + 12;
kill(postmasterPID, SIGINT); kill(postmasterPID, SIGINT);
......
...@@ -34,8 +34,13 @@ else ...@@ -34,8 +34,13 @@ else
close CONF; close CONF;
command_ok([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ], command_ok([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ],
'pg_ctl start -w'); 'pg_ctl start -w');
command_ok([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ], # sleep here is because Windows builds can't check postmaster.pid exactly,
'second pg_ctl start succeeds'); # so they may mistake a pre-existing postmaster.pid for one created by the
# postmaster they start. Waiting more than the 2 seconds slop time allowed
# by test_postmaster_connection prevents that mistake.
sleep 3 if ($windows_os);
command_fails([ 'pg_ctl', 'start', '-D', "$tempdir/data", '-w' ],
'second pg_ctl start -w fails');
command_ok([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ], command_ok([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ],
'pg_ctl stop -w'); 'pg_ctl stop -w');
command_fails([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ], command_fails([ 'pg_ctl', 'stop', '-D', "$tempdir/data", '-w', '-m', 'fast' ],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment