Commit 61752afb authored by Thomas Munro

Provide recovery_init_sync_method=syncfs.

Since commit 2ce439f3 we have opened every file in the data directory
and called fsync() at the start of crash recovery.  This can be very
slow if there are many files, leading to field complaints of systems
taking minutes or even hours to begin crash recovery.

Provide an alternative method, for Linux only, where we call syncfs() on
every possibly different filesystem under the data directory.  This is
equivalent, but avoids faulting in potentially many inodes from
potentially slow storage.

The new mode comes with some caveats, described in the documentation, so
the default value for the new setting is "fsync", preserving the older
behavior.
Reported-by: Michael Brown <michael.brown@discourse.org>
Reviewed-by: Fujii Masao <masao.fujii@oss.nttdata.com>
Reviewed-by: Paul Guo <guopa@vmware.com>
Reviewed-by: Bruce Momjian <bruce@momjian.us>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Reviewed-by: David Steele <david@pgmasters.net>
Discussion: https://postgr.es/m/11bc2bb7-ecb5-3ad0-b39f-df632734cd81%40discourse.org
Discussion: https://postgr.es/m/CAEET0ZHGnbXmi8yF3ywsDZvb3m9CbdsGZgfTXscQ6agcbzcZAw%40mail.gmail.com
parent b822ae13
......@@ -15409,7 +15409,7 @@ fi
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l writev
for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink syncfs sync_file_range uselocale wcstombs_l writev
do :
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
......
......@@ -1701,6 +1701,7 @@ AC_CHECK_FUNCS(m4_normalize([
strchrnul
strsignal
symlink
syncfs
sync_file_range
uselocale
wcstombs_l
......
......@@ -9721,6 +9721,41 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
</listitem>
</varlistentry>
<varlistentry id="guc-recovery-init-sync-method" xreflabel="recovery_init_sync_method">
<term><varname>recovery_init_sync_method</varname> (<type>enum</type>)
<indexterm>
<primary><varname>recovery_init_sync_method</varname> configuration parameter</primary>
</indexterm>
</term>
<listitem>
<para>
When set to <literal>fsync</literal>, which is the default,
<productname>PostgreSQL</productname> will recursively open and
synchronize all files in the data directory before crash recovery
begins. The search for files will follow symbolic links for the WAL
directory and each configured tablespace (but not any other symbolic
links). This is intended to make sure that all WAL and data files are
durably stored on disk before replaying changes. This applies whenever
starting a database cluster that did not shut down cleanly, including
copies created with <application>pg_basebackup</application>.
</para>
<para>
On Linux, <literal>syncfs</literal> may be used instead, to ask the
operating system to synchronize the whole file systems that contain the
data directory, the WAL files and each tablespace (but not any other
file systems that may be reachable through symbolic links). This may
be a lot faster than the <literal>fsync</literal> setting, because it
doesn't need to open each file one by one. On the other hand, it may
be slower if a file system is shared by other applications that
modify a lot of files, since those files will also be written to disk.
Furthermore, on versions of Linux before 5.8, I/O errors encountered
while writing data to disk may not be reported to
<productname>PostgreSQL</productname>, and relevant error messages may
appear only in kernel logs.
</para>
</listitem>
</varlistentry>
</variablelist>
</sect1>
......
......@@ -72,9 +72,11 @@
#include "postgres.h"
#include <dirent.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
......@@ -158,6 +160,9 @@ int max_safe_fds = FD_MINFREE; /* default if not changed */
/* Whether it is safe to continue running after fsync() fails. */
bool data_sync_retry = false;
/* How SyncDataDirectory() should do its job. */
int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
/* Debugging.... */
#ifdef FDDEBUG
......@@ -3265,9 +3270,31 @@ looks_like_temp_rel_name(const char *name)
return true;
}
#ifdef HAVE_SYNCFS
/*
 * do_syncfs -- ask the kernel to flush the file system that contains "path".
 *
 * "path" is opened read-only solely to obtain a descriptor to hand to
 * syncfs().  Both failure cases (cannot open, syncfs() fails) are reported
 * at LOG level and the function then simply returns; nothing is returned to
 * the caller, so the operation is best-effort.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		/* Quote the path, as the syncfs() failure message below does. */
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not sync filesystem for \"%s\": %m", path)));
	CloseTransientFile(fd);
}
#endif
/*
* Issue fsync recursively on PGDATA and all its contents.
* Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
* all potential filesystems, depending on the recovery_init_sync_method setting.
*
* We fsync regular files and directories wherever they are, but we
* follow symlinks only for pg_wal and immediately under pg_tblspc.
......@@ -3319,6 +3346,42 @@ SyncDataDirectory(void)
xlog_is_symlink = true;
#endif
#ifdef HAVE_SYNCFS
/*
* syncfs mode: sync whole filesystems instead of individual files, then
* return early so the per-file fsync path below is skipped.
*/
if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
{
DIR *dir;
struct dirent *de;
/*
* On Linux, we don't have to open every single file one by one. We
* can use syncfs() to sync whole filesystems. We only expect
* filesystem boundaries to exist where we tolerate symlinks, namely
* pg_wal and the tablespaces, so we call syncfs() for each of those
* directories.
*/
/* Sync the top level pgdata directory. */
do_syncfs(".");
/* If any tablespaces are configured, sync each of those. */
/*
* NOTE(review): the AllocateDir() result is not checked here; presumably
* ReadDirExtended() (called with LOG) and FreeDir() cope with a failed
* open by logging and doing nothing -- confirm against their definitions.
*/
dir = AllocateDir("pg_tblspc");
while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
{
char path[MAXPGPATH];
/* Skip the self and parent directory entries. */
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
continue;
/* Every other entry under pg_tblspc gets its own syncfs() call. */
snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
do_syncfs(path);
}
FreeDir(dir);
/* If pg_wal is a symlink, process that too. */
if (xlog_is_symlink)
do_syncfs("pg_wal");
/* Done; do not fall through to the fsync-based path. */
return;
}
#endif /* HAVE_SYNCFS */
/*
* If possible, hint to the kernel that we're soon going to fsync the data
* directory and its contents. Errors in this step are even less
......
......@@ -488,6 +488,14 @@ const struct config_enum_entry ssl_protocol_versions_info[] = {
StaticAssertDecl(lengthof(ssl_protocol_versions_info) == (PG_TLS1_3_VERSION + 2),
"array length mismatch");
/*
 * Accepted values for the recovery_init_sync_method setting.
 *
 * "syncfs" is only offered when the build found syncfs() (HAVE_SYNCFS).
 * The entry with a NULL name terminates the list.  The table is never
 * modified, so it is declared const.
 */
static const struct config_enum_entry recovery_init_sync_method_options[] = {
	{"fsync", RECOVERY_INIT_SYNC_METHOD_FSYNC, false},
#ifdef HAVE_SYNCFS
	{"syncfs", RECOVERY_INIT_SYNC_METHOD_SYNCFS, false},
#endif
	{NULL, 0, false}
};
static struct config_enum_entry shared_memory_options[] = {
#ifndef WIN32
{"sysv", SHMEM_TYPE_SYSV, false},
......@@ -4871,6 +4879,15 @@ static struct config_enum ConfigureNamesEnum[] =
NULL, NULL, NULL
},
{
{"recovery_init_sync_method", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
gettext_noop("Sets the method for synchronizing the data directory before crash recovery."),
},
&recovery_init_sync_method,
RECOVERY_INIT_SYNC_METHOD_FSYNC, recovery_init_sync_method_options,
NULL, NULL, NULL
},
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL
......
......@@ -761,6 +761,7 @@
#restart_after_crash = on # reinitialize after backend crash?
#remove_temp_files_after_crash = on # remove temporary files after
# backend crash?
#recovery_init_sync_method = fsync # fsync, syncfs (Linux 5.8+)
#data_sync_retry = off # retry or panic on failure to fsync
# data?
# (change requires restart)
......
......@@ -590,6 +590,9 @@
/* Define to 1 if you have the `symlink' function. */
#undef HAVE_SYMLINK
/* Define to 1 if you have the `syncfs' function. */
#undef HAVE_SYNCFS
/* Define to 1 if you have the `sync_file_range' function. */
#undef HAVE_SYNC_FILE_RANGE
......
......@@ -45,6 +45,11 @@
#include <dirent.h>
/*
 * Possible values of recovery_init_sync_method, which selects how
 * SyncDataDirectory() does its job.
 */
typedef enum RecoveryInitSyncMethod
{
	RECOVERY_INIT_SYNC_METHOD_FSYNC = 0,	/* "fsync" */
	RECOVERY_INIT_SYNC_METHOD_SYNCFS = 1	/* "syncfs" */
} RecoveryInitSyncMethod;
struct iovec; /* avoid including port/pg_iovec.h here */
typedef int File;
......@@ -53,6 +58,7 @@ typedef int File;
/* GUC parameter */
extern PGDLLIMPORT int max_files_per_process;
extern PGDLLIMPORT bool data_sync_retry;
/* Marked PGDLLIMPORT like the other GUC variables declared here. */
extern PGDLLIMPORT int recovery_init_sync_method;
/*
* This is private to fd.c, but exported for save/restore_backend_variables()
......
......@@ -388,6 +388,7 @@ sub GenerateFiles
HAVE_STRUCT_TM_TM_ZONE => undef,
HAVE_SYNC_FILE_RANGE => undef,
HAVE_SYMLINK => 1,
HAVE_SYNCFS => undef,
HAVE_SYSLOG => undef,
HAVE_SYS_EPOLL_H => undef,
HAVE_SYS_EVENT_H => undef,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment