Commit b966dd6c authored by Tom Lane

Add fsync capability to initdb, and use sync_file_range() if available.

Historically we have not worried about fsync'ing anything during initdb
(in fact, initdb intentionally passes -F to each backend launch to prevent
it from fsync'ing).  But with filesystems getting more aggressive about
caching data, that's not such a good plan anymore.  Make initdb do a pass
over the finished data directory tree to fsync everything.  For testing
purposes, the -N/--nosync flag can be used to restore the old behavior.

Also, testing shows that on Linux, sync_file_range() is much faster than
posix_fadvise() for hinting to the kernel that an fsync is coming,
apparently because the latter blocks on a rather small request queue while
the former doesn't.  So use this function if available in initdb, and also
in the backend's pg_flush_data() (where it currently will affect only the
speed of CREATE DATABASE's cloning step).

We will later make pg_regress invoke initdb with the --nosync flag
to avoid slowing down cases such as "make check" in contrib.  But
let's not do so until we've shaken out any portability issues in this
patch.

Jeff Davis, reviewed by Andres Freund
parent 1a9405d2
...@@ -19254,7 +19254,8 @@ fi ...@@ -19254,7 +19254,8 @@ fi
for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink towlower utime utimes wcstombs wcstombs_l
for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
do do
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
{ $as_echo "$as_me:$LINENO: checking for $ac_func" >&5 { $as_echo "$as_me:$LINENO: checking for $ac_func" >&5
......
...@@ -1207,7 +1207,7 @@ PGAC_VAR_INT_TIMEZONE ...@@ -1207,7 +1207,7 @@ PGAC_VAR_INT_TIMEZONE
AC_FUNC_ACCEPT_ARGTYPES AC_FUNC_ACCEPT_ARGTYPES
PGAC_FUNC_GETTIMEOFDAY_1ARG PGAC_FUNC_GETTIMEOFDAY_1ARG
AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink towlower utime utimes wcstombs wcstombs_l]) AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l])
AC_REPLACE_FUNCS(fseeko) AC_REPLACE_FUNCS(fseeko)
case $host_os in case $host_os in
......
...@@ -219,6 +219,21 @@ PostgreSQL documentation ...@@ -219,6 +219,21 @@ PostgreSQL documentation
</listitem> </listitem>
</varlistentry> </varlistentry>
<varlistentry>
<term><option>-N</option></term>
<term><option>--nosync</option></term>
<listitem>
<para>
By default, <command>initdb</command> will wait for all files to be
written safely to disk. This option causes <command>initdb</command>
to return without waiting, which is faster, but means that a
subsequent operating system crash can leave the data directory
corrupt. Generally, this option is useful for testing, but should not
be used when creating a production installation.
</para>
</listitem>
</varlistentry>
<varlistentry> <varlistentry>
<term><option>--pwfile=<replaceable>filename</></option></term> <term><option>--pwfile=<replaceable>filename</></option></term>
<listitem> <listitem>
......
...@@ -336,12 +336,15 @@ pg_fdatasync(int fd) ...@@ -336,12 +336,15 @@ pg_fdatasync(int fd)
/* /*
* pg_flush_data --- advise OS that the data described won't be needed soon * pg_flush_data --- advise OS that the data described won't be needed soon
* *
* Not all platforms have posix_fadvise; treat as noop if not available. * Not all platforms have sync_file_range or posix_fadvise; treat as no-op
* if not available.
*/ */
int int
pg_flush_data(int fd, off_t offset, off_t amount) pg_flush_data(int fd, off_t offset, off_t amount)
{ {
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) #if defined(HAVE_SYNC_FILE_RANGE)
return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
#else #else
return 0; return 0;
......
...@@ -49,6 +49,7 @@ ...@@ -49,6 +49,7 @@
#include "postgres_fe.h" #include "postgres_fe.h"
#include <dirent.h> #include <dirent.h>
#include <fcntl.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h> #include <unistd.h>
#include <locale.h> #include <locale.h>
...@@ -116,6 +117,7 @@ static const char *authmethodhost = ""; ...@@ -116,6 +117,7 @@ static const char *authmethodhost = "";
static const char *authmethodlocal = ""; static const char *authmethodlocal = "";
static bool debug = false; static bool debug = false;
static bool noclean = false; static bool noclean = false;
static bool do_sync = true;
static bool show_setting = false; static bool show_setting = false;
static char *xlog_dir = ""; static char *xlog_dir = "";
...@@ -160,6 +162,9 @@ static char *authwarning = NULL; ...@@ -160,6 +162,9 @@ static char *authwarning = NULL;
/* /*
* Centralized knowledge of switches to pass to backend * Centralized knowledge of switches to pass to backend
* *
* Note: we run the backend with -F (fsync disabled) and then do a single
* pass of fsync'ing at the end. This is faster than fsync'ing each step.
*
* Note: in the shell-script version, we also passed PGDATA as a -D switch, * Note: in the shell-script version, we also passed PGDATA as a -D switch,
* but here it is more convenient to pass it as an environment variable * but here it is more convenient to pass it as an environment variable
* (no quoting to worry about). * (no quoting to worry about).
...@@ -182,6 +187,9 @@ static char **filter_lines_with_token(char **lines, const char *token); ...@@ -182,6 +187,9 @@ static char **filter_lines_with_token(char **lines, const char *token);
#endif #endif
static char **readfile(const char *path); static char **readfile(const char *path);
static void writefile(char *path, char **lines); static void writefile(char *path, char **lines);
static void walkdir(char *path, void (*action)(char *fname, bool isdir));
static void pre_sync_fname(char *fname, bool isdir);
static void fsync_fname(char *fname, bool isdir);
static FILE *popen_check(const char *command, const char *mode); static FILE *popen_check(const char *command, const char *mode);
static void exit_nicely(void); static void exit_nicely(void);
static char *get_id(void); static char *get_id(void);
...@@ -209,6 +217,7 @@ static void load_plpgsql(void); ...@@ -209,6 +217,7 @@ static void load_plpgsql(void);
static void vacuum_db(void); static void vacuum_db(void);
static void make_template0(void); static void make_template0(void);
static void make_postgres(void); static void make_postgres(void);
static void perform_fsync(void);
static void trapsig(int signum); static void trapsig(int signum);
static void check_ok(void); static void check_ok(void);
static char *escape_quotes(const char *src); static char *escape_quotes(const char *src);
...@@ -489,6 +498,174 @@ writefile(char *path, char **lines) ...@@ -489,6 +498,174 @@ writefile(char *path, char **lines)
} }
} }
/*
 * walkdir: recursively walk a directory, applying the action to each
 * regular file and directory (including the named directory itself).
 *
 * path is the directory to walk.  action is invoked as action(fname, isdir):
 * with isdir = false for every regular file found, and with isdir = true for
 * each directory once all of its entries have been processed.  Entry types
 * are determined with lstat(), so anything that is neither a directory nor a
 * regular file (a symbolic link, for instance) is skipped.
 *
 * A failure to open, stat, read or close anything is reported on stderr and
 * followed by a call to exit_nicely().
 *
 * Adapted from copydir() in copydir.c.
 */
static void
walkdir(char *path, void (*action) (char *fname, bool isdir))
{
    DIR        *dir;
    struct dirent *direntry;
    char        subpath[MAXPGPATH];

    dir = opendir(path);
    if (dir == NULL)
    {
        fprintf(stderr, _("%s: could not open directory \"%s\": %s\n"),
                progname, path, strerror(errno));
        exit_nicely();
    }

    /*
     * readdir() returns NULL both at end-of-directory and on error; clearing
     * errno before every call lets us tell the two apart after the loop.
     */
    while (errno = 0, (direntry = readdir(dir)) != NULL)
    {
        struct stat fst;

        if (strcmp(direntry->d_name, ".") == 0 ||
            strcmp(direntry->d_name, "..") == 0)
            continue;

        snprintf(subpath, MAXPGPATH, "%s/%s", path, direntry->d_name);

        if (lstat(subpath, &fst) < 0)
        {
            fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
                    progname, subpath, strerror(errno));
            exit_nicely();
        }

        if (S_ISDIR(fst.st_mode))
            walkdir(subpath, action);
        else if (S_ISREG(fst.st_mode))
            (*action) (subpath, false);
    }

#ifdef WIN32

    /*
     * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
     * released version
     */
    if (GetLastError() == ERROR_NO_MORE_FILES)
        errno = 0;
#endif

    if (errno)
    {
        fprintf(stderr, _("%s: could not read directory \"%s\": %s\n"),
                progname, path, strerror(errno));
        exit_nicely();
    }

    /* closedir() can fail too; don't let that pass silently */
    if (closedir(dir) != 0)
    {
        fprintf(stderr, _("%s: could not close directory \"%s\": %s\n"),
                progname, path, strerror(errno));
        exit_nicely();
    }

    /*
     * It's important to apply the action to the directory itself as well:
     * fsyncing individual files doesn't guarantee that the directory entry
     * for each file is synced.  Recent versions of ext4 have made the window
     * much wider but it's been an issue for ext3 and other filesystems in
     * the past.
     */
    (*action) (path, true);
}
/*
 * pre_sync_fname: tell the OS that an fsync() of this file is coming.
 *
 * fname is the file or directory to hint about; isdir says which it is.
 * The hint is issued with sync_file_range() when available, otherwise with
 * posix_fadvise(); when neither is available this function does nothing.
 *
 * Failing to open a regular file is reported on stderr and followed by
 * exit_nicely().  A directory that cannot be opened (EISDIR or EACCES) is
 * skipped quietly.
 */
static void
pre_sync_fname(char *fname, bool isdir)
{
#if defined(HAVE_SYNC_FILE_RANGE) || \
    (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
    int         handle = open(fname, O_RDONLY | PG_BINARY);

    if (handle < 0)
    {
        /*
         * Some OSs don't allow us to open directories at all (Windows
         * returns EACCES); that is not worth complaining about.
         */
        if (isdir && (errno == EISDIR || errno == EACCES))
            return;

        fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
                progname, fname, strerror(errno));
        exit_nicely();
    }

    /*
     * sync_file_range is preferred over posix_fadvise.  The result is
     * deliberately discarded: this is only a hint, so failure is harmless.
     */
#if defined(HAVE_SYNC_FILE_RANGE)
    sync_file_range(handle, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    posix_fadvise(handle, 0, 0, POSIX_FADV_DONTNEED);
#endif

    close(handle);
#endif
}
/*
 * fsync_fname: fsync a file or directory
 *
 * fname is the path to flush; isdir tells whether it names a directory.
 *
 * Try to fsync directories but ignore errors that indicate the OS
 * just doesn't allow/require fsyncing directories.  Every other failure
 * to open or fsync is reported on stderr and followed by exit_nicely().
 *
 * Adapted from fsync_fname() in copydir.c.
 */
static void
fsync_fname(char *fname, bool isdir)
{
    int         fd;
    int         returncode;

    /*
     * Some OSs require directories to be opened read-only whereas other
     * systems don't allow us to fsync files opened read-only; so we need both
     * cases here
     */
    if (!isdir)
        fd = open(fname, O_RDWR | PG_BINARY);
    else
        fd = open(fname, O_RDONLY | PG_BINARY);

    /*
     * Some OSs don't allow us to open directories at all (Windows returns
     * EACCES)
     */
    if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
        return;
    else if (fd < 0)
    {
        fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
                progname, fname, strerror(errno));
        exit_nicely();
    }

    returncode = fsync(fd);

    /*
     * Some OSs don't allow us to fsync directories at all.  Depending on the
     * platform that shows up as EBADF or as EINVAL (the error POSIX assigns
     * to a descriptor on which fsync is not possible), so tolerate both.
     */
    if (returncode != 0 && isdir && (errno == EBADF || errno == EINVAL))
    {
        close(fd);
        return;
    }

    if (returncode != 0)
    {
        fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
                progname, fname, strerror(errno));
        exit_nicely();
    }

    close(fd);
}
/* /*
* Open a subcommand with suitable error messaging * Open a subcommand with suitable error messaging
*/ */
...@@ -2092,6 +2269,47 @@ make_postgres(void) ...@@ -2092,6 +2269,47 @@ make_postgres(void)
check_ok(); check_ok();
} }
/*
* fsync everything down to disk
*/
static void
perform_fsync(void)
{
char pdir[MAXPGPATH];
fputs(_("syncing data to disk ... "), stdout);
fflush(stdout);
/*
* We need to name the parent of PGDATA. get_parent_directory() isn't
* enough here, because it can result in an empty string.
*/
snprintf(pdir, MAXPGPATH, "%s/..", pg_data);
canonicalize_path(pdir);
/*
* Hint to the OS so that we're going to fsync each of these files soon.
*/
/* first the parent of the PGDATA directory */
pre_sync_fname(pdir, true);
/* then recursively through the directory */
walkdir(pg_data, pre_sync_fname);
/*
* Now, do the fsync()s in the same order.
*/
/* first the parent of the PGDATA directory */
fsync_fname(pdir, true);
/* then recursively through the directory */
walkdir(pg_data, fsync_fname);
check_ok();
}
/* /*
* signal handler in case we are interrupted. * signal handler in case we are interrupted.
...@@ -2532,6 +2750,7 @@ usage(const char *progname) ...@@ -2532,6 +2750,7 @@ usage(const char *progname)
printf(_(" -d, --debug generate lots of debugging output\n")); printf(_(" -d, --debug generate lots of debugging output\n"));
printf(_(" -L DIRECTORY where to find the input files\n")); printf(_(" -L DIRECTORY where to find the input files\n"));
printf(_(" -n, --noclean do not clean up after errors\n")); printf(_(" -n, --noclean do not clean up after errors\n"));
printf(_(" -N, --nosync do not wait for changes to be written safely to disk\n"));
printf(_(" -s, --show show internal settings\n")); printf(_(" -s, --show show internal settings\n"));
printf(_("\nOther options:\n")); printf(_("\nOther options:\n"));
printf(_(" -V, --version output version information, then exit\n")); printf(_(" -V, --version output version information, then exit\n"));
...@@ -2621,6 +2840,7 @@ main(int argc, char *argv[]) ...@@ -2621,6 +2840,7 @@ main(int argc, char *argv[])
{"debug", no_argument, NULL, 'd'}, {"debug", no_argument, NULL, 'd'},
{"show", no_argument, NULL, 's'}, {"show", no_argument, NULL, 's'},
{"noclean", no_argument, NULL, 'n'}, {"noclean", no_argument, NULL, 'n'},
{"nosync", no_argument, NULL, 'N'},
{"xlogdir", required_argument, NULL, 'X'}, {"xlogdir", required_argument, NULL, 'X'},
{NULL, 0, NULL, 0} {NULL, 0, NULL, 0}
}; };
...@@ -2676,7 +2896,7 @@ main(int argc, char *argv[]) ...@@ -2676,7 +2896,7 @@ main(int argc, char *argv[])
/* process command-line options */ /* process command-line options */
while ((c = getopt_long(argc, argv, "dD:E:L:nU:WA:sT:X:", long_options, &option_index)) != -1) while ((c = getopt_long(argc, argv, "dD:E:L:nNU:WA:sT:X:", long_options, &option_index)) != -1)
{ {
switch (c) switch (c)
{ {
...@@ -2719,6 +2939,9 @@ main(int argc, char *argv[]) ...@@ -2719,6 +2939,9 @@ main(int argc, char *argv[])
noclean = true; noclean = true;
printf(_("Running in noclean mode. Mistakes will not be cleaned up.\n")); printf(_("Running in noclean mode. Mistakes will not be cleaned up.\n"));
break; break;
case 'N':
do_sync = false;
break;
case 'L': case 'L':
share_path = xstrdup(optarg); share_path = xstrdup(optarg);
break; break;
...@@ -3310,6 +3533,11 @@ main(int argc, char *argv[]) ...@@ -3310,6 +3533,11 @@ main(int argc, char *argv[])
make_postgres(); make_postgres();
if (do_sync)
perform_fsync();
else
printf(_("\nSync to disk skipped.\nThe data directory might become corrupt if the operating system crashes.\n"));
if (authwarning != NULL) if (authwarning != NULL)
fprintf(stderr, "%s", authwarning); fprintf(stderr, "%s", authwarning);
......
...@@ -511,6 +511,9 @@ ...@@ -511,6 +511,9 @@
/* Define to 1 if you have the `symlink' function. */ /* Define to 1 if you have the `symlink' function. */
#undef HAVE_SYMLINK #undef HAVE_SYMLINK
/* Define to 1 if you have the `sync_file_range' function. */
#undef HAVE_SYNC_FILE_RANGE
/* Define to 1 if you have the syslog interface. */ /* Define to 1 if you have the syslog interface. */
#undef HAVE_SYSLOG #undef HAVE_SYSLOG
......
...@@ -420,6 +420,9 @@ ...@@ -420,6 +420,9 @@
/* Define to 1 if you have the `symlink' function. */ /* Define to 1 if you have the `symlink' function. */
#define HAVE_SYMLINK 1 #define HAVE_SYMLINK 1
/* Define to 1 if you have the `sync_file_range' function. */
/* #undef HAVE_SYNC_FILE_RANGE */
/* Define to 1 if you have the `sysconf' function. */ /* Define to 1 if you have the `sysconf' function. */
/* #undef HAVE_SYSCONF */ /* #undef HAVE_SYSCONF */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment