Commit f8c183a1 authored by Greg Stark

Speed up CREATE DATABASE by deferring the fsyncs until after copying
all the data and using posix_fadvise to nudge the OS into flushing it
earlier. This also hopefully makes CREATE DATABASE avoid spamming the
cache.

Tests show a big speedup on Linux at least on some filesystems.

Idea and patch from Andres Freund.
parent e26c539e
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.153 2010/01/12 02:42:52 momjian Exp $
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.154 2010/02/15 00:50:57 stark Exp $
*
* NOTES:
*
......@@ -319,6 +319,22 @@ pg_fdatasync(int fd)
return 0;
}
/*
 * pg_flush_data --- hint to the OS that a file range won't be needed soon
 *
 * Where both USE_POSIX_FADVISE and POSIX_FADV_DONTNEED are defined, the
 * range of "amount" bytes starting at "offset" in "fd" is handed to
 * posix_fadvise with POSIX_FADV_DONTNEED, and that call's result is
 * returned unchanged.  Everywhere else this is a no-op that returns 0.
 */
int
pg_flush_data(int fd, off_t offset, off_t amount)
{
	int			result = 0;

#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	result = posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
#endif

	return result;
}
/*
* InitFileAccess --- initialize this module during backend startup
*
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.66 2010/01/02 16:58:08 momjian Exp $
* $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.67 2010/02/15 00:50:57 stark Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -98,6 +98,7 @@ extern int pg_fsync(int fd);
extern int pg_fsync_no_writethrough(int fd);
extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd);
extern int pg_flush_data(int fd, off_t offset, off_t amount);
/* Filename components for OpenTemporaryFile */
#define PG_TEMP_FILES_DIR "pgsql_tmp"
......
......@@ -11,7 +11,7 @@
* as a service.
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/port/copydir.c,v 1.25 2010/02/14 17:50:52 stark Exp $
* $PostgreSQL: pgsql/src/port/copydir.c,v 1.26 2010/02/15 00:50:57 stark Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -37,6 +37,7 @@
static void copy_file(char *fromfile, char *tofile);
static void fsync_fname(char *fname);
/*
......@@ -91,27 +92,32 @@ copydir(char *fromdir, char *todir, bool recurse)
copy_file(fromfile, tofile);
}
FreeDir(xldir);
/*
* fsync the directory to make sure not just the data but also the
* new directory file entries have reached the disk. While needed
* by most filesystems, the window got bigger with newer ones like
* ext4.
* Be paranoid here and fsync all files to ensure we catch problems.
*/
dirfd = BasicOpenFile(todir,
O_RDONLY | PG_BINARY,
S_IRUSR | S_IWUSR);
if(dirfd == -1)
if (xldir == NULL)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open directory for fsync \"%s\": %m", todir)));
errmsg("could not open directory \"%s\": %m", fromdir)));
if(pg_fsync(dirfd) == -1)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync directory \"%s\": %m", todir)));
close(dirfd);
while ((xlde = ReadDir(xldir, fromdir)) != NULL)
{
if (strcmp(xlde->d_name, ".") == 0 ||
strcmp(xlde->d_name, "..") == 0)
continue;
snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name);
fsync_fname(tofile);
}
FreeDir(xldir);
/* It's important to fsync the destination directory itself as
* individual file fsyncs don't guarantee that the directory entry
* for the file is synced. Recent versions of ext4 have made the
* window much wider but it's been true for ext3 and other
* filesystems in the past.
*/
fsync_fname(todir);
}
/*
......@@ -124,6 +130,7 @@ copy_file(char *fromfile, char *tofile)
int srcfd;
int dstfd;
int nbytes;
off_t offset;
/* Use palloc to ensure we get a maxaligned buffer */
#define COPY_BUF_SIZE (8 * BLCKSZ)
......@@ -149,7 +156,7 @@ copy_file(char *fromfile, char *tofile)
/*
* Do the data copying.
*/
for (;;)
for (offset=0; ; offset+=nbytes)
{
nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
if (nbytes < 0)
......@@ -168,15 +175,14 @@ copy_file(char *fromfile, char *tofile)
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tofile)));
}
}
/*
* Be paranoid here to ensure we catch problems.
* We fsync the files later but first flush them to avoid spamming
* the cache and hopefully get the kernel to start writing them
* out before the fsync comes.
*/
if (pg_fsync(dstfd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tofile)));
pg_flush_data(dstfd, offset, nbytes);
}
if (close(dstfd))
ereport(ERROR,
......@@ -187,3 +193,27 @@ copy_file(char *fromfile, char *tofile)
pfree(buffer);
}
/*
 * fsync_fname --- fsync the file or directory at the given path
 *
 * Opens "fname" read-only, forces it to disk with pg_fsync, and closes
 * it again.  A failure to open or to fsync is reported via ereport(ERROR).
 */
static void
fsync_fname(char *fname)
{
	int			fd;

	fd = BasicOpenFile(fname,
					   O_RDONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));

	if (pg_fsync(fd) != 0)
	{
		/*
		 * ereport(ERROR) does not return to this point, so the descriptor
		 * must be released first or it would stay open.  Preserve errno
		 * across close() so that %m still describes the fsync failure.
		 */
		int			save_errno = errno;

		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", fname)));
	}

	close(fd);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment