Commit d8179b00 authored by Tom Lane's avatar Tom Lane

Fix fsync-at-startup code to not treat errors as fatal.

Commit 2ce439f3 introduced a rather serious
regression, namely that if its scan of the data directory came across any
un-fsync-able files, it would fail and thereby prevent database startup.
Worse yet, symlinks to such files also caused the problem, which meant that
crash restart was guaranteed to fail on certain common installations such
as older Debian.

After discussion, we agreed that (1) failure to start is worse than any
consequence of not fsync'ing is likely to be, therefore treat all errors
in this code as nonfatal; (2) we should not chase symlinks other than
those that are expected to exist, namely pg_xlog/ and tablespace links
under pg_tblspc/.  The latter restriction avoids possibly fsync'ing a
much larger part of the filesystem than intended, if the user has left
random symlinks hanging about in the data directory.

This commit takes care of that and also does some code beautification,
mainly moving the relevant code into fd.c, which seems a much better place
for it than xlog.c, and making sure that the conditional compilation for
the pre_sync_fname pass has something to do with whether pg_flush_data
works.

I also relocated the call site in xlog.c down a few lines; it seems a
bit silly to be doing this before ValidateXLOGDirectoryStructure().

The similar logic in initdb.c ought to be made to match this, but that
change is noncritical and will be dealt with separately.

Back-patch to all active branches, like the prior commit.

Abhijit Menon-Sen and Tom Lane
parent d5442cb2
...@@ -866,8 +866,6 @@ static void WALInsertLockAcquireExclusive(void); ...@@ -866,8 +866,6 @@ static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void); static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
static void fsync_pgdata(char *datadir);
/* /*
* Insert an XLOG record represented by an already-constructed chain of data * Insert an XLOG record represented by an already-constructed chain of data
* chunks. This is a low-level routine; to construct the WAL record header * chunks. This is a low-level routine; to construct the WAL record header
...@@ -5951,18 +5949,6 @@ StartupXLOG(void) ...@@ -5951,18 +5949,6 @@ StartupXLOG(void)
(errmsg("database system was interrupted; last known up at %s", (errmsg("database system was interrupted; last known up at %s",
str_time(ControlFile->time)))); str_time(ControlFile->time))));
/*
* If we previously crashed, there might be data which we had written,
* intending to fsync it, but which we had not actually fsync'd yet.
* Therefore, a power failure in the near future might cause earlier
* unflushed writes to be lost, even though more recent data written to
* disk from here on would be persisted. To avoid that, fsync the entire
* data directory.
*/
if (ControlFile->state != DB_SHUTDOWNED &&
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
fsync_pgdata(data_directory);
/* This is just to allow attaching to startup process with a debugger */ /* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY #ifdef XLOG_REPLAY_DELAY
if (ControlFile->state != DB_SHUTDOWNED) if (ControlFile->state != DB_SHUTDOWNED)
...@@ -5976,6 +5962,18 @@ StartupXLOG(void) ...@@ -5976,6 +5962,18 @@ StartupXLOG(void)
*/ */
ValidateXLOGDirectoryStructure(); ValidateXLOGDirectoryStructure();
/*
* If we previously crashed, there might be data which we had written,
* intending to fsync it, but which we had not actually fsync'd yet.
* Therefore, a power failure in the near future might cause earlier
* unflushed writes to be lost, even though more recent data written to
* disk from here on would be persisted. To avoid that, fsync the entire
* data directory.
*/
if (ControlFile->state != DB_SHUTDOWNED &&
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
SyncDataDirectory();
/* /*
* Initialize on the assumption we want to recover to the latest timeline * Initialize on the assumption we want to recover to the latest timeline
* that's active according to pg_control. * that's active according to pg_control.
...@@ -11602,31 +11600,3 @@ SetWalWriterSleeping(bool sleeping) ...@@ -11602,31 +11600,3 @@ SetWalWriterSleeping(bool sleeping)
XLogCtl->WalWriterSleeping = sleeping; XLogCtl->WalWriterSleeping = sleeping;
SpinLockRelease(&XLogCtl->info_lck); SpinLockRelease(&XLogCtl->info_lck);
} }
/*
* Issue fsync recursively on PGDATA and all its contents.
*/
static void
fsync_pgdata(char *datadir)
{
if (!enableFsync)
return;
/*
* If possible, hint to the kernel that we're soon going to fsync the data
* directory and its contents.
*/
#if defined(HAVE_SYNC_FILE_RANGE) || \
(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
walkdir(datadir, pre_sync_fname);
#endif
/*
* Now we do the fsync()s in the same order.
*
* It's important to fsync the destination directory itself as individual
* file fsyncs don't guarantee that the directory entry for the file is
* synced.
*/
walkdir(datadir, fsync_fname);
}
...@@ -79,6 +79,13 @@ ...@@ -79,6 +79,13 @@
#include "utils/resowner_private.h" #include "utils/resowner_private.h"
/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
#if defined(HAVE_SYNC_FILE_RANGE)
#define PG_FLUSH_DATA_WORKS 1
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
#define PG_FLUSH_DATA_WORKS 1
#endif
/* /*
* We must leave some file descriptors free for system(), the dynamic loader, * We must leave some file descriptors free for system(), the dynamic loader,
* and other code that tries to open files without consulting fd.c. This * and other code that tries to open files without consulting fd.c. This
...@@ -283,6 +290,8 @@ static int FileAccess(File file); ...@@ -283,6 +290,8 @@ static int FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError); static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static bool reserveAllocatedDesc(void); static bool reserveAllocatedDesc(void);
static int FreeDesc(AllocateDesc *desc); static int FreeDesc(AllocateDesc *desc);
static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
static void AtProcExit_Files(int code, Datum arg); static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit); static void CleanupTempFiles(bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname); static void RemovePgTempFilesInDir(const char *tmpdirname);
...@@ -290,6 +299,15 @@ static void RemovePgTempRelationFiles(const char *tsdirname); ...@@ -290,6 +299,15 @@ static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
static bool looks_like_temp_rel_name(const char *name); static bool looks_like_temp_rel_name(const char *name);
static void walkdir(const char *path,
void (*action) (const char *fname, bool isdir, int elevel),
bool process_symlinks,
int elevel);
#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void fsync_fname_ext(const char *fname, bool isdir, int elevel);
/* /*
* pg_fsync --- do fsync with or without writethrough * pg_fsync --- do fsync with or without writethrough
...@@ -372,14 +390,18 @@ pg_fdatasync(int fd) ...@@ -372,14 +390,18 @@ pg_fdatasync(int fd)
int int
pg_flush_data(int fd, off_t offset, off_t amount) pg_flush_data(int fd, off_t offset, off_t amount)
{ {
#ifdef PG_FLUSH_DATA_WORKS
if (enableFsync) if (enableFsync)
{ {
#if defined(HAVE_SYNC_FILE_RANGE) #if defined(HAVE_SYNC_FILE_RANGE)
return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE); return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif #endif
} }
#endif
return 0; return 0;
} }
...@@ -1942,22 +1964,35 @@ TryAgain: ...@@ -1942,22 +1964,35 @@ TryAgain:
*/ */
struct dirent * struct dirent *
ReadDir(DIR *dir, const char *dirname) ReadDir(DIR *dir, const char *dirname)
{
return ReadDirExtended(dir, dirname, ERROR);
}
/*
* Alternate version that allows caller to specify the elevel for any
* error report. If elevel < ERROR, returns NULL on any error.
*/
static struct dirent *
ReadDirExtended(DIR *dir, const char *dirname, int elevel)
{ {
struct dirent *dent; struct dirent *dent;
/* Give a generic message for AllocateDir failure, if caller didn't */ /* Give a generic message for AllocateDir failure, if caller didn't */
if (dir == NULL) if (dir == NULL)
ereport(ERROR, {
ereport(elevel,
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", errmsg("could not open directory \"%s\": %m",
dirname))); dirname)));
return NULL;
}
errno = 0; errno = 0;
if ((dent = readdir(dir)) != NULL) if ((dent = readdir(dir)) != NULL)
return dent; return dent;
if (errno) if (errno)
ereport(ERROR, ereport(elevel,
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg("could not read directory \"%s\": %m", errmsg("could not read directory \"%s\": %m",
dirname))); dirname)));
...@@ -2440,54 +2475,121 @@ looks_like_temp_rel_name(const char *name) ...@@ -2440,54 +2475,121 @@ looks_like_temp_rel_name(const char *name)
return true; return true;
} }
/* /*
* Hint to the OS that it should get ready to fsync() this file. * Issue fsync recursively on PGDATA and all its contents.
* *
* Adapted from pre_sync_fname in initdb.c * We fsync regular files and directories wherever they are, but we
* follow symlinks only for pg_xlog and immediately under pg_tblspc.
* Other symlinks are presumed to point at files we're not responsible
* for fsyncing, and might not have privileges to write at all.
*
* Errors are logged but not considered fatal; that's because this is used
* only during database startup, to deal with the possibility that there are
* issued-but-unsynced writes pending against the data directory. We want to
* ensure that such writes reach disk before anything that's done in the new
* run. However, aborting on error would result in failure to start for
* harmless cases such as read-only files in the data directory, and that's
* not good either.
*
* Note we assume we're chdir'd into PGDATA to begin with.
*/ */
void void
pre_sync_fname(char *fname, bool isdir) SyncDataDirectory(void)
{ {
int fd; bool xlog_is_symlink;
fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0); /* We can skip this whole thing if fsync is disabled. */
if (!enableFsync)
return;
/* /*
* Some OSs don't allow us to open directories at all (Windows returns * If pg_xlog is a symlink, we'll need to recurse into it separately,
* EACCES) * because the first walkdir below will ignore it.
*/ */
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) xlog_is_symlink = false;
return;
if (fd < 0) #ifndef WIN32
ereport(FATAL, {
(errmsg("could not open file \"%s\": %m", struct stat st;
fname)));
pg_flush_data(fd, 0, 0); if (lstat("pg_xlog", &st) < 0)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m",
"pg_xlog")));
else if (S_ISLNK(st.st_mode))
xlog_is_symlink = true;
}
#else
if (pgwin32_is_junction("pg_xlog"))
xlog_is_symlink = true;
#endif
close(fd); /*
* If possible, hint to the kernel that we're soon going to fsync the data
* directory and its contents. Errors in this step are even less
* interesting than normal, so log them only at DEBUG1.
*/
#ifdef PG_FLUSH_DATA_WORKS
walkdir(".", pre_sync_fname, false, DEBUG1);
if (xlog_is_symlink)
walkdir("pg_xlog", pre_sync_fname, false, DEBUG1);
walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
#endif
/*
* Now we do the fsync()s in the same order.
*
* The main call ignores symlinks, so in addition to specially processing
* pg_xlog if it's a symlink, pg_tblspc has to be visited separately with
* process_symlinks = true. Note that if there are any plain directories
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
* so we don't worry about optimizing it.
*/
walkdir(".", fsync_fname_ext, false, LOG);
if (xlog_is_symlink)
walkdir("pg_xlog", fsync_fname_ext, false, LOG);
walkdir("pg_tblspc", fsync_fname_ext, true, LOG);
} }
/* /*
* walkdir: recursively walk a directory, applying the action to each * walkdir: recursively walk a directory, applying the action to each
* regular file and directory (including the named directory itself) * regular file and directory (including the named directory itself).
* and following symbolic links. *
* If process_symlinks is true, the action and recursion are also applied
* to regular files and directories that are pointed to by symlinks in the
* given directory; otherwise symlinks are ignored. Symlinks are always
* ignored in subdirectories, ie we intentionally don't pass down the
* process_symlinks flag to recursive calls.
* *
* NB: There is another version of walkdir in initdb.c, but that version * Errors are reported at level elevel, which might be ERROR or less.
* behaves differently with respect to symbolic links. Caveat emptor! *
* See also walkdir in initdb.c, which is a frontend version of this logic.
*/ */
void static void
walkdir(char *path, void (*action) (char *fname, bool isdir)) walkdir(const char *path,
void (*action) (const char *fname, bool isdir, int elevel),
bool process_symlinks,
int elevel)
{ {
DIR *dir; DIR *dir;
struct dirent *de; struct dirent *de;
dir = AllocateDir(path); dir = AllocateDir(path);
while ((de = ReadDir(dir, path)) != NULL) if (dir == NULL)
{
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", path)));
return;
}
while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
{ {
char subpath[MAXPGPATH]; char subpath[MAXPGPATH];
struct stat fst; struct stat fst;
int sret;
CHECK_FOR_INTERRUPTS(); CHECK_FOR_INTERRUPTS();
...@@ -2497,59 +2599,132 @@ walkdir(char *path, void (*action) (char *fname, bool isdir)) ...@@ -2497,59 +2599,132 @@ walkdir(char *path, void (*action) (char *fname, bool isdir))
snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name); snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
if (lstat(subpath, &fst) < 0) if (process_symlinks)
ereport(ERROR, sret = stat(subpath, &fst);
else
sret = lstat(subpath, &fst);
if (sret < 0)
{
ereport(elevel,
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", subpath))); errmsg("could not stat file \"%s\": %m", subpath)));
continue;
}
if (S_ISREG(fst.st_mode)) if (S_ISREG(fst.st_mode))
(*action) (subpath, false); (*action) (subpath, false, elevel);
else if (S_ISDIR(fst.st_mode)) else if (S_ISDIR(fst.st_mode))
walkdir(subpath, action); walkdir(subpath, action, false, elevel);
#ifndef WIN32 }
else if (S_ISLNK(fst.st_mode))
#else FreeDir(dir); /* we ignore any error here */
else if (pgwin32_is_junction(subpath))
#endif /*
* It's important to fsync the destination directory itself as individual
* file fsyncs don't guarantee that the directory entry for the file is
* synced.
*/
(*action) (path, true, elevel);
}
/*
* Hint to the OS that it should get ready to fsync() this file.
*
* Ignores errors trying to open unreadable files, and logs other errors at a
* caller-specified level.
*/
#ifdef PG_FLUSH_DATA_WORKS
static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
int fd;
fd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, 0);
if (fd < 0)
{ {
#if defined(HAVE_READLINK) || defined(WIN32) if (errno == EACCES || (isdir && errno == EISDIR))
char linkpath[MAXPGPATH]; return;
int len;
struct stat lst;
len = readlink(subpath, linkpath, sizeof(linkpath)); #ifdef ETXTBSY
if (len < 0) if (errno == ETXTBSY)
ereport(ERROR, return;
#endif
ereport(elevel,
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg("could not read symbolic link \"%s\": %m", errmsg("could not open file \"%s\": %m", fname)));
subpath))); return;
if (len >= sizeof(linkpath)) }
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), (void) pg_flush_data(fd, 0, 0);
errmsg("symbolic link \"%s\" target is too long",
subpath)));
linkpath[len] = '\0';
if (lstat(linkpath, &lst) == 0) (void) CloseTransientFile(fd);
}
#endif /* PG_FLUSH_DATA_WORKS */
/*
* fsync_fname_ext -- Try to fsync a file or directory
*
* Ignores errors trying to open unreadable files, or trying to fsync
* directories on systems where that isn't allowed/required, and logs other
* errors at a caller-specified level.
*/
static void
fsync_fname_ext(const char *fname, bool isdir, int elevel)
{
int fd;
int flags;
int returncode;
/*
* Some OSs require directories to be opened read-only whereas other
* systems don't allow us to fsync files opened read-only; so we need both
* cases here. Using O_RDWR will cause us to fail to fsync files that are
* not writable by our userid, but we assume that's OK.
*/
flags = PG_BINARY;
if (!isdir)
flags |= O_RDWR;
else
flags |= O_RDONLY;
/*
* Open the file, silently ignoring errors about unreadable files (or
* unsupported operations, e.g. opening a directory under Windows), and
* logging others.
*/
fd = OpenTransientFile((char *) fname, flags, 0);
if (fd < 0)
{ {
if (S_ISREG(lst.st_mode)) if (errno == EACCES || (isdir && errno == EISDIR))
(*action) (linkpath, false); return;
else if (S_ISDIR(lst.st_mode))
walkdir(subpath, action); #ifdef ETXTBSY
} if (errno == ETXTBSY)
else if (errno != ENOENT) return;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", linkpath)));
#else
ereport(WARNING,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("this platform does not support symbolic links; ignoring \"%s\"",
subpath)));
#endif #endif
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", fname)));
return;
} }
}
FreeDir(dir);
(*action) (path, true); returncode = pg_fsync(fd);
/*
* Some OSes don't allow us to fsync directories at all, so we can ignore
* those errors. Anything else needs to be logged.
*/
if (returncode != 0 && !(isdir && errno == EBADF))
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", fname)));
(void) CloseTransientFile(fd);
} }
...@@ -114,8 +114,7 @@ extern int pg_fsync_writethrough(int fd); ...@@ -114,8 +114,7 @@ extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd); extern int pg_fdatasync(int fd);
extern int pg_flush_data(int fd, off_t offset, off_t amount); extern int pg_flush_data(int fd, off_t offset, off_t amount);
extern void fsync_fname(char *fname, bool isdir); extern void fsync_fname(char *fname, bool isdir);
extern void pre_sync_fname(char *fname, bool isdir); extern void SyncDataDirectory(void);
extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
/* Filename components for OpenTemporaryFile */ /* Filename components for OpenTemporaryFile */
#define PG_TEMP_FILES_DIR "pgsql_tmp" #define PG_TEMP_FILES_DIR "pgsql_tmp"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment