Commit f8c183a1 authored by Greg Stark

Speed up CREATE DATABASE by deferring the fsyncs until after copying

all the data and using posix_fadvise to nudge the OS into flushing it
earlier. This also hopefully makes CREATE DATABASE avoid spamming the
cache.

Tests show a big speedup on Linux at least on some filesystems.

Idea and patch from Andres Freund.
parent e26c539e
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.153 2010/01/12 02:42:52 momjian Exp $ * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.154 2010/02/15 00:50:57 stark Exp $
* *
* NOTES: * NOTES:
* *
...@@ -319,6 +319,22 @@ pg_fdatasync(int fd) ...@@ -319,6 +319,22 @@ pg_fdatasync(int fd)
return 0; return 0;
} }
/*
 * pg_flush_data --- hint to the OS that a byte range of a file will not be
 * needed again soon
 *
 * fd is the open file descriptor; offset and amount delimit the byte range
 * the hint applies to.
 *
 * When both USE_POSIX_FADVISE and POSIX_FADV_DONTNEED are defined, the hint
 * is issued through posix_fadvise() and that call's result is handed back
 * unchanged.  On builds without them this is a no-op that returns 0.
 */
int
pg_flush_data(int fd, off_t offset, off_t amount)
{
	int			rc = 0;		/* result when no advice call is compiled in */

#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	rc = posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
#endif

	return rc;
}
/* /*
* InitFileAccess --- initialize this module during backend startup * InitFileAccess --- initialize this module during backend startup
* *
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.66 2010/01/02 16:58:08 momjian Exp $ * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.67 2010/02/15 00:50:57 stark Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -98,6 +98,7 @@ extern int pg_fsync(int fd); ...@@ -98,6 +98,7 @@ extern int pg_fsync(int fd);
extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_no_writethrough(int fd);
extern int pg_fsync_writethrough(int fd); extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd); extern int pg_fdatasync(int fd);
extern int pg_flush_data(int fd, off_t offset, off_t amount);
/* Filename components for OpenTemporaryFile */ /* Filename components for OpenTemporaryFile */
#define PG_TEMP_FILES_DIR "pgsql_tmp" #define PG_TEMP_FILES_DIR "pgsql_tmp"
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
* as a service. * as a service.
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/port/copydir.c,v 1.25 2010/02/14 17:50:52 stark Exp $ * $PostgreSQL: pgsql/src/port/copydir.c,v 1.26 2010/02/15 00:50:57 stark Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
static void copy_file(char *fromfile, char *tofile); static void copy_file(char *fromfile, char *tofile);
static void fsync_fname(char *fname);
/* /*
...@@ -91,27 +92,32 @@ copydir(char *fromdir, char *todir, bool recurse) ...@@ -91,27 +92,32 @@ copydir(char *fromdir, char *todir, bool recurse)
copy_file(fromfile, tofile); copy_file(fromfile, tofile);
} }
FreeDir(xldir);
/* /*
* fsync the directory to make sure not just the data but also the * Be paranoid here and fsync all files to ensure we catch problems.
* new directory file entries have reached the disk. While needed
* by most filesystems, the window got bigger with newer ones like
* ext4.
*/ */
dirfd = BasicOpenFile(todir, if (xldir == NULL)
O_RDONLY | PG_BINARY,
S_IRUSR | S_IWUSR);
if(dirfd == -1)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open directory for fsync \"%s\": %m", todir)));
if(pg_fsync(dirfd) == -1)
ereport(ERROR, ereport(ERROR,
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg("could not fsync directory \"%s\": %m", todir))); errmsg("could not open directory \"%s\": %m", fromdir)));
close(dirfd);
while ((xlde = ReadDir(xldir, fromdir)) != NULL)
{
if (strcmp(xlde->d_name, ".") == 0 ||
strcmp(xlde->d_name, "..") == 0)
continue;
snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name);
fsync_fname(tofile);
}
FreeDir(xldir);
/* It's important to fsync the destination directory itself as
* individual file fsyncs don't guarantee that the directory entry
* for the file is synced. Recent versions of ext4 have made the
* window much wider but it's been true for ext3 and other
* filesystems in the past
*/
fsync_fname(todir);
} }
/* /*
...@@ -124,6 +130,7 @@ copy_file(char *fromfile, char *tofile) ...@@ -124,6 +130,7 @@ copy_file(char *fromfile, char *tofile)
int srcfd; int srcfd;
int dstfd; int dstfd;
int nbytes; int nbytes;
off_t offset;
/* Use palloc to ensure we get a maxaligned buffer */ /* Use palloc to ensure we get a maxaligned buffer */
#define COPY_BUF_SIZE (8 * BLCKSZ) #define COPY_BUF_SIZE (8 * BLCKSZ)
...@@ -149,7 +156,7 @@ copy_file(char *fromfile, char *tofile) ...@@ -149,7 +156,7 @@ copy_file(char *fromfile, char *tofile)
/* /*
* Do the data copying. * Do the data copying.
*/ */
for (;;) for (offset=0; ; offset+=nbytes)
{ {
nbytes = read(srcfd, buffer, COPY_BUF_SIZE); nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
if (nbytes < 0) if (nbytes < 0)
...@@ -168,15 +175,14 @@ copy_file(char *fromfile, char *tofile) ...@@ -168,15 +175,14 @@ copy_file(char *fromfile, char *tofile)
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tofile))); errmsg("could not write to file \"%s\": %m", tofile)));
} }
}
/* /*
* Be paranoid here to ensure we catch problems. * We fsync the files later but first flush them to avoid spamming
*/ * the cache and hopefully get the kernel to start writing them
if (pg_fsync(dstfd) != 0) * out before the fsync comes.
ereport(ERROR, */
(errcode_for_file_access(), pg_flush_data(dstfd, offset, nbytes);
errmsg("could not fsync file \"%s\": %m", tofile))); }
if (close(dstfd)) if (close(dstfd))
ereport(ERROR, ereport(ERROR,
...@@ -187,3 +193,27 @@ copy_file(char *fromfile, char *tofile) ...@@ -187,3 +193,27 @@ copy_file(char *fromfile, char *tofile)
pfree(buffer); pfree(buffer);
} }
/*
 * fsync_fname --- fsync the file or directory named by fname
 *
 * Opens fname read-only, runs pg_fsync() on the resulting descriptor and
 * closes it again.  A failure to open or to fsync is reported through
 * ereport(ERROR) with the path and the system error text (%m).
 *
 * The descriptor is closed before the fsync failure is reported, so it is
 * not left open on the error path.
 */
static void
fsync_fname(char *fname)
{
	int			fd;

	fd = BasicOpenFile(fname,
					   O_RDONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));

	if (pg_fsync(fd) != 0)
	{
		/*
		 * Release the descriptor before reporting; ereport(ERROR) is
		 * presumably not going to return control here, so this is the last
		 * chance to close it.  close() may overwrite errno, so save and
		 * restore it to keep %m describing the fsync failure.
		 */
		int			save_errno = errno;

		close(fd);
		errno = save_errno;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", fname)));
	}

	close(fd);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment