Commit dc6c4c9d authored by Andres Freund's avatar Andres Freund

Add infrastructure for sharing temporary files between backends.

SharedFileSet allows temporary files to be created by one backend and
then exported for read-only access by other backends, with clean-up
managed by reference counting associated with a DSM segment.  This
includes changes to fd.c and buffile.c to support the new kind of
temporary file.

This will be used by an upcoming patch adding support for parallel
hash joins.

Author: Thomas Munro
Reviewed-By: Peter Geoghegan, Andres Freund, Robert Haas, Rushabh Lathia
Discussion:
    https://postgr.es/m/CAEepm=2W=cOkiZxcg6qiFQP-dHUe09aqTrEMM7yJDrHMhDv_RA@mail.gmail.com
    https://postgr.es/m/CAH2-WznJ_UgLux=_jTgCQ4yFz0iBntudsNKa1we3kN1BAG=88w@mail.gmail.com
parent 35438e57
...@@ -12,6 +12,6 @@ subdir = src/backend/storage/file ...@@ -12,6 +12,6 @@ subdir = src/backend/storage/file
top_builddir = ../../../.. top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global include $(top_builddir)/src/Makefile.global
OBJS = fd.o buffile.o copydir.o reinit.o OBJS = fd.o buffile.o copydir.o reinit.o sharedfileset.o
include $(top_srcdir)/src/backend/common.mk include $(top_srcdir)/src/backend/common.mk
...@@ -31,12 +31,18 @@ ...@@ -31,12 +31,18 @@
* BufFile also supports temporary files that exceed the OS file size limit * BufFile also supports temporary files that exceed the OS file size limit
* (by opening multiple fd.c temporary files). This is an essential feature * (by opening multiple fd.c temporary files). This is an essential feature
* for sorts and hashjoins on large amounts of data. * for sorts and hashjoins on large amounts of data.
*
* BufFile supports temporary files that can be made read-only and shared with
* other backends, as infrastructure for parallel execution. Such files need
* to be created as a member of a SharedFileSet that all participants are
* attached to.
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
#include "postgres.h" #include "postgres.h"
#include "executor/instrument.h" #include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h" #include "pgstat.h"
#include "storage/fd.h" #include "storage/fd.h"
#include "storage/buffile.h" #include "storage/buffile.h"
...@@ -70,6 +76,10 @@ struct BufFile ...@@ -70,6 +76,10 @@ struct BufFile
bool isInterXact; /* keep open over transactions? */ bool isInterXact; /* keep open over transactions? */
bool dirty; /* does buffer need to be written? */ bool dirty; /* does buffer need to be written? */
bool readOnly; /* has the file been set to read only? */
SharedFileSet *fileset; /* space for segment files if shared */
const char *name; /* name of this BufFile if shared */
/* /*
* resowner is the ResourceOwner to use for underlying temp files. (We * resowner is the ResourceOwner to use for underlying temp files. (We
...@@ -94,6 +104,7 @@ static void extendBufFile(BufFile *file); ...@@ -94,6 +104,7 @@ static void extendBufFile(BufFile *file);
static void BufFileLoadBuffer(BufFile *file); static void BufFileLoadBuffer(BufFile *file);
static void BufFileDumpBuffer(BufFile *file); static void BufFileDumpBuffer(BufFile *file);
static int BufFileFlush(BufFile *file); static int BufFileFlush(BufFile *file);
static File MakeNewSharedSegment(BufFile *file, int segment);
/* /*
...@@ -117,6 +128,9 @@ makeBufFile(File firstfile) ...@@ -117,6 +128,9 @@ makeBufFile(File firstfile)
file->curOffset = 0L; file->curOffset = 0L;
file->pos = 0; file->pos = 0;
file->nbytes = 0; file->nbytes = 0;
file->readOnly = false;
file->fileset = NULL;
file->name = NULL;
return file; return file;
} }
...@@ -134,7 +148,11 @@ extendBufFile(BufFile *file) ...@@ -134,7 +148,11 @@ extendBufFile(BufFile *file)
oldowner = CurrentResourceOwner; oldowner = CurrentResourceOwner;
CurrentResourceOwner = file->resowner; CurrentResourceOwner = file->resowner;
if (file->fileset == NULL)
pfile = OpenTemporaryFile(file->isInterXact); pfile = OpenTemporaryFile(file->isInterXact);
else
pfile = MakeNewSharedSegment(file, file->numFiles);
Assert(pfile >= 0); Assert(pfile >= 0);
CurrentResourceOwner = oldowner; CurrentResourceOwner = oldowner;
...@@ -175,6 +193,189 @@ BufFileCreateTemp(bool interXact) ...@@ -175,6 +193,189 @@ BufFileCreateTemp(bool interXact)
return file; return file;
} }
/*
* Build the name for a given segment of a given BufFile.
*/
static void
SharedSegmentName(char *name, const char *buffile_name, int segment)
{
snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
}
/*
* Create a new segment file backing a shared BufFile.
*/
static File
MakeNewSharedSegment(BufFile *buffile, int segment)
{
char name[MAXPGPATH];
File file;
SharedSegmentName(name, buffile->name, segment);
file = SharedFileSetCreate(buffile->fileset, name);
/* SharedFileSetCreate would've errored out */
Assert(file > 0);
return file;
}
/*
* Create a BufFile that can be discovered and opened read-only by other
* backends that are attached to the same SharedFileSet using the same name.
*
* The naming scheme for shared BufFiles is left up to the calling code. The
* name will appear as part of one or more filenames on disk, and might
* provide clues to administrators about which subsystem is generating
* temporary file data. Since each SharedFileSet object is backed by one or
* more uniquely named temporary directory, names don't conflict with
* unrelated SharedFileSet objects.
*/
BufFile *
BufFileCreateShared(SharedFileSet *fileset, const char *name)
{
BufFile *file;
file = (BufFile *) palloc(sizeof(BufFile));
file->fileset = fileset;
file->name = pstrdup(name);
file->numFiles = 1;
file->files = (File *) palloc(sizeof(File));
file->files[0] = MakeNewSharedSegment(file, 0);
file->offsets = (off_t *) palloc(sizeof(off_t));
file->offsets[0] = 0L;
file->isInterXact = false;
file->dirty = false;
file->resowner = CurrentResourceOwner;
file->curFile = 0;
file->curOffset = 0L;
file->pos = 0;
file->nbytes = 0;
file->readOnly = false;
file->name = pstrdup(name);
return file;
}
/*
* Open a file that was previously created in another backend (or this one)
* with BufFileCreateShared in the same SharedFileSet using the same name.
* The backend that created the file must have called BufFileClose() or
* BufFileExport() to make sure that it is ready to be opened by other
* backends and render it read-only.
*/
BufFile *
BufFileOpenShared(SharedFileSet *fileset, const char *name)
{
BufFile *file = (BufFile *) palloc(sizeof(BufFile));
char segment_name[MAXPGPATH];
Size capacity = 16;
File *files = palloc(sizeof(File) * capacity);
int nfiles = 0;
file = (BufFile *) palloc(sizeof(BufFile));
files = palloc(sizeof(File) * capacity);
/*
* We don't know how many segments there are, so we'll probe the
* filesystem to find out.
*/
for (;;)
{
/* See if we need to expand our file segment array. */
if (nfiles + 1 > capacity)
{
capacity *= 2;
files = repalloc(files, sizeof(File) * capacity);
}
/* Try to load a segment. */
SharedSegmentName(segment_name, name, nfiles);
files[nfiles] = SharedFileSetOpen(fileset, segment_name);
if (files[nfiles] <= 0)
break;
++nfiles;
CHECK_FOR_INTERRUPTS();
}
/*
* If we didn't find any files at all, then no BufFile exists with this
* name.
*/
if (nfiles == 0)
return NULL;
file->numFiles = nfiles;
file->files = files;
file->offsets = (off_t *) palloc0(sizeof(off_t) * nfiles);
file->isInterXact = false;
file->dirty = false;
file->resowner = CurrentResourceOwner; /* Unused, can't extend */
file->curFile = 0;
file->curOffset = 0L;
file->pos = 0;
file->nbytes = 0;
file->readOnly = true; /* Can't write to files opened this way */
file->fileset = fileset;
file->name = pstrdup(name);
return file;
}
/*
* Delete a BufFile that was created by BufFileCreateShared in the given
* SharedFileSet using the given name.
*
* It is not necessary to delete files explicitly with this function. It is
* provided only as a way to delete files proactively, rather than waiting for
* the SharedFileSet to be cleaned up.
*
* Only one backend should attempt to delete a given name, and should know
* that it exists and has been exported or closed.
*/
void
BufFileDeleteShared(SharedFileSet *fileset, const char *name)
{
char segment_name[MAXPGPATH];
int segment = 0;
bool found = false;
/*
* We don't know how many segments the file has. We'll keep deleting
* until we run out. If we don't manage to find even an initial segment,
* raise an error.
*/
for (;;)
{
SharedSegmentName(segment_name, name, segment);
if (!SharedFileSetDelete(fileset, segment_name, true))
break;
found = true;
++segment;
CHECK_FOR_INTERRUPTS();
}
if (!found)
elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name);
}
/*
* BufFileExportShared --- flush and make read-only, in preparation for sharing.
*/
void
BufFileExportShared(BufFile *file)
{
/* Must be a file belonging to a SharedFileSet. */
Assert(file->fileset != NULL);
/* It's probably a bug if someone calls this twice. */
Assert(!file->readOnly);
BufFileFlush(file);
file->readOnly = true;
}
/* /*
* Close a BufFile * Close a BufFile
* *
...@@ -390,6 +591,8 @@ BufFileWrite(BufFile *file, void *ptr, size_t size) ...@@ -390,6 +591,8 @@ BufFileWrite(BufFile *file, void *ptr, size_t size)
size_t nwritten = 0; size_t nwritten = 0;
size_t nthistime; size_t nthistime;
Assert(!file->readOnly);
while (size > 0) while (size > 0)
{ {
if (file->pos >= BLCKSZ) if (file->pos >= BLCKSZ)
......
...@@ -39,6 +39,14 @@ ...@@ -39,6 +39,14 @@
* for a long time, like relation files. It is the caller's responsibility * for a long time, like relation files. It is the caller's responsibility
* to close them, there is no automatic mechanism in fd.c for that. * to close them, there is no automatic mechanism in fd.c for that.
* *
* PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
* temporary files that have names so that they can be shared between
* backends. Such files are automatically closed and count against the
* temporary file limit of the backend that creates them, but unlike anonymous
* files they are not automatically deleted. See sharedfileset.c for a shared
* ownership mechanism that provides automatic cleanup for shared files when
* the last of a group of backends detaches.
*
* AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
* wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively. * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
* They behave like the corresponding native functions, except that the handle * They behave like the corresponding native functions, except that the handle
...@@ -175,8 +183,9 @@ int max_safe_fds = 32; /* default if not changed */ ...@@ -175,8 +183,9 @@ int max_safe_fds = 32; /* default if not changed */
#define FilePosIsUnknown(pos) ((pos) < 0) #define FilePosIsUnknown(pos) ((pos) < 0)
/* these are the assigned bits in fdstate below: */ /* these are the assigned bits in fdstate below: */
#define FD_TEMPORARY (1 << 0) /* T = delete when closed */ #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
#define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */ #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
typedef struct vfd typedef struct vfd
{ {
...@@ -313,7 +322,7 @@ static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel) ...@@ -313,7 +322,7 @@ static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel)
static void AtProcExit_Files(int code, Datum arg); static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit); static void CleanupTempFiles(bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname); static void RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all);
static void RemovePgTempRelationFiles(const char *tsdirname); static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
static bool looks_like_temp_rel_name(const char *name); static bool looks_like_temp_rel_name(const char *name);
...@@ -326,6 +335,7 @@ static void walkdir(const char *path, ...@@ -326,6 +335,7 @@ static void walkdir(const char *path,
static void pre_sync_fname(const char *fname, bool isdir, int elevel); static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif #endif
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
static int fsync_parent_path(const char *fname, int elevel); static int fsync_parent_path(const char *fname, int elevel);
...@@ -1294,6 +1304,39 @@ FileAccess(File file) ...@@ -1294,6 +1304,39 @@ FileAccess(File file)
return 0; return 0;
} }
/*
* Called whenever a temporary file is deleted to report its size.
*/
static void
ReportTemporaryFileUsage(const char *path, off_t size)
{
pgstat_report_tempfile(size);
if (log_temp_files >= 0)
{
if ((size / 1024) >= log_temp_files)
ereport(LOG,
(errmsg("temporary file: path \"%s\", size %lu",
path, (unsigned long) size)));
}
}
/*
* Called to register a temporary file for automatic close.
* ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
* before the file was opened.
*/
static void
RegisterTemporaryFile(File file)
{
ResourceOwnerRememberFile(CurrentResourceOwner, file);
VfdCache[file].resowner = CurrentResourceOwner;
/* Backup mechanism for closing at end of xact. */
VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
have_xact_temporary_files = true;
}
/* /*
* Called when we get a shared invalidation message on some relation. * Called when we get a shared invalidation message on some relation.
*/ */
...@@ -1378,6 +1421,67 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) ...@@ -1378,6 +1421,67 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
return file; return file;
} }
/*
* Create directory 'directory'. If necessary, create 'basedir', which must
* be the directory above it. This is designed for creating the top-level
* temporary directory on demand before creating a directory underneath it.
* Do nothing if the directory already exists.
*
* Directories created within the top-level temporary directory should begin
* with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
* deleted at startup by RemovePgTempFiles(). Further subdirectories below
* that do not need any particular prefix.
*/
void
PathNameCreateTemporaryDir(const char *basedir, const char *directory)
{
if (mkdir(directory, S_IRWXU) < 0)
{
if (errno == EEXIST)
return;
/*
* Failed. Try to create basedir first in case it's missing. Tolerate
* EEXIST to close a race against another process following the same
* algorithm.
*/
if (mkdir(basedir, S_IRWXU) < 0 && errno != EEXIST)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("cannot create temporary directory \"%s\": %m",
basedir)));
/* Try again. */
if (mkdir(directory, S_IRWXU) < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("cannot create temporary subdirectory \"%s\": %m",
directory)));
}
}
/*
* Delete a directory and everything in it, if it exists.
*/
void
PathNameDeleteTemporaryDir(const char *dirname)
{
struct stat statbuf;
/* Silently ignore missing directory. */
if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
return;
/*
* Currently, walkdir doesn't offer a way for our passed in function to
* maintain state. Perhaps it should, so that we could tell the caller
* whether this operation succeeded or failed. Since this operation is
* used in a cleanup path, we wouldn't actually behave differently: we'll
* just log failures.
*/
walkdir(dirname, unlink_if_exists_fname, false, LOG);
}
/* /*
* Open a temporary file that will disappear when we close it. * Open a temporary file that will disappear when we close it.
* *
...@@ -1432,53 +1536,52 @@ OpenTemporaryFile(bool interXact) ...@@ -1432,53 +1536,52 @@ OpenTemporaryFile(bool interXact)
DEFAULTTABLESPACE_OID, DEFAULTTABLESPACE_OID,
true); true);
/* Mark it for deletion at close */ /* Mark it for deletion at close and temporary file size limit */
VfdCache[file].fdstate |= FD_TEMPORARY; VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
/* Register it with the current resource owner */ /* Register it with the current resource owner */
if (!interXact) if (!interXact)
{ RegisterTemporaryFile(file);
VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
VfdCache[file].resowner = CurrentResourceOwner;
ResourceOwnerRememberFile(CurrentResourceOwner, file);
/* ensure cleanup happens at eoxact */
have_xact_temporary_files = true;
}
return file; return file;
} }
/* /*
* Open a temporary file in a specific tablespace. * Return the path of the temp directory in a given tablespace.
* Subroutine for OpenTemporaryFile, which see for details.
*/ */
static File void
OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) TempTablespacePath(char *path, Oid tablespace)
{ {
char tempdirpath[MAXPGPATH];
char tempfilepath[MAXPGPATH];
File file;
/* /*
* Identify the tempfile directory for this tablespace. * Identify the tempfile directory for this tablespace.
* *
* If someone tries to specify pg_global, use pg_default instead. * If someone tries to specify pg_global, use pg_default instead.
*/ */
if (tblspcOid == DEFAULTTABLESPACE_OID || if (tablespace == InvalidOid ||
tblspcOid == GLOBALTABLESPACE_OID) tablespace == DEFAULTTABLESPACE_OID ||
{ tablespace == GLOBALTABLESPACE_OID)
/* The default tablespace is {datadir}/base */ snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
PG_TEMP_FILES_DIR);
}
else else
{ {
/* All other tablespaces are accessed via symlinks */ /* All other tablespaces are accessed via symlinks */
snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s", snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); tablespace, TABLESPACE_VERSION_DIRECTORY,
PG_TEMP_FILES_DIR);
} }
}
/*
* Open a temporary file in a specific tablespace.
* Subroutine for OpenTemporaryFile, which see for details.
*/
static File
OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
{
char tempdirpath[MAXPGPATH];
char tempfilepath[MAXPGPATH];
File file;
TempTablespacePath(tempdirpath, tblspcOid);
/* /*
* Generate a tempfile name that should be unique within the current * Generate a tempfile name that should be unique within the current
...@@ -1515,6 +1618,130 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) ...@@ -1515,6 +1618,130 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
return file; return file;
} }
/*
* Create a new file. The directory containing it must already exist. Files
* created this way are subject to temp_file_limit and are automatically
* closed at end of transaction, but are not automatically deleted on close
* because they are intended to be shared between cooperating backends.
*
* If the file is inside the top-level temporary directory, its name should
* begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
* and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
* inside a directory created with PathnameCreateTemporaryDir(), in which case
* the prefix isn't needed.
*/
File
PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
{
File file;
ResourceOwnerEnlargeFiles(CurrentResourceOwner);
/*
* Open the file. Note: we don't use O_EXCL, in case there is an orphaned
* temp file that can be reused.
*/
file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
if (file <= 0)
{
if (error_on_failure)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create temporary file \"%s\": %m",
path)));
else
return file;
}
/* Mark it for temp_file_limit accounting. */
VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
/* Register it for automatic close. */
RegisterTemporaryFile(file);
return file;
}
/*
* Open a file that was created with PathNameCreateTemporaryFile, possibly in
* another backend. Files opened this way don't count against the
* temp_file_limit of the caller, are read-only and are automatically closed
* at the end of the transaction but are not deleted on close.
*/
File
PathNameOpenTemporaryFile(const char *path)
{
File file;
ResourceOwnerEnlargeFiles(CurrentResourceOwner);
/* We open the file read-only. */
file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
/* If no such file, then we don't raise an error. */
if (file <= 0 && errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open temporary file \"%s\": %m",
path)));
if (file > 0)
{
/* Register it for automatic close. */
RegisterTemporaryFile(file);
}
return file;
}
/*
* Delete a file by pathname. Return true if the file existed, false if
* didn't.
*/
bool
PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
{
struct stat filestats;
int stat_errno;
/* Get the final size for pgstat reporting. */
if (stat(path, &filestats) != 0)
stat_errno = errno;
else
stat_errno = 0;
/*
* Unlike FileClose's automatic file deletion code, we tolerate
* non-existence to support BufFileDeleteShared which doesn't know how
* many segments it has to delete until it runs out.
*/
if (stat_errno == ENOENT)
return false;
if (unlink(path) < 0)
{
if (errno != ENOENT)
ereport(error_on_failure ? ERROR : LOG,
(errcode_for_file_access(),
errmsg("cannot unlink temporary file \"%s\": %m",
path)));
return false;
}
if (stat_errno == 0)
ReportTemporaryFileUsage(path, filestats.st_size);
else
{
errno = stat_errno;
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", path)));
}
return true;
}
/* /*
* close a file when done with it * close a file when done with it
*/ */
...@@ -1543,10 +1770,17 @@ FileClose(File file) ...@@ -1543,10 +1770,17 @@ FileClose(File file)
Delete(file); Delete(file);
} }
if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
{
/* Subtract its size from current usage (do first in case of error) */
temporary_files_size -= vfdP->fileSize;
vfdP->fileSize = 0;
}
/* /*
* Delete the file if it was temporary, and make a log entry if wanted * Delete the file if it was temporary, and make a log entry if wanted
*/ */
if (vfdP->fdstate & FD_TEMPORARY) if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
{ {
struct stat filestats; struct stat filestats;
int stat_errno; int stat_errno;
...@@ -1558,11 +1792,8 @@ FileClose(File file) ...@@ -1558,11 +1792,8 @@ FileClose(File file)
* is arranged to ensure that the worst-case consequence is failing to * is arranged to ensure that the worst-case consequence is failing to
* emit log message(s), not failing to attempt the unlink. * emit log message(s), not failing to attempt the unlink.
*/ */
vfdP->fdstate &= ~FD_TEMPORARY; vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
/* Subtract its size from current usage (do first in case of error) */
temporary_files_size -= vfdP->fileSize;
vfdP->fileSize = 0;
/* first try the stat() */ /* first try the stat() */
if (stat(vfdP->fileName, &filestats)) if (stat(vfdP->fileName, &filestats))
...@@ -1576,18 +1807,7 @@ FileClose(File file) ...@@ -1576,18 +1807,7 @@ FileClose(File file)
/* and last report the stat results */ /* and last report the stat results */
if (stat_errno == 0) if (stat_errno == 0)
{ ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
pgstat_report_tempfile(filestats.st_size);
if (log_temp_files >= 0)
{
if ((filestats.st_size / 1024) >= log_temp_files)
ereport(LOG,
(errmsg("temporary file: path \"%s\", size %lu",
vfdP->fileName,
(unsigned long) filestats.st_size)));
}
}
else else
{ {
errno = stat_errno; errno = stat_errno;
...@@ -1761,7 +1981,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info) ...@@ -1761,7 +1981,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
* message if we do that. All current callers would just throw error * message if we do that. All current callers would just throw error
* immediately anyway, so this is safe at present. * immediately anyway, so this is safe at present.
*/ */
if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY)) if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
{ {
off_t newPos; off_t newPos;
...@@ -1814,7 +2034,7 @@ retry: ...@@ -1814,7 +2034,7 @@ retry:
* get here in that state if we're not enforcing temporary_files_size, * get here in that state if we're not enforcing temporary_files_size,
* so we don't care. * so we don't care.
*/ */
if (vfdP->fdstate & FD_TEMPORARY) if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
{ {
off_t newPos = vfdP->seekPos; off_t newPos = vfdP->seekPos;
...@@ -1985,7 +2205,7 @@ FileTruncate(File file, off_t offset, uint32 wait_event_info) ...@@ -1985,7 +2205,7 @@ FileTruncate(File file, off_t offset, uint32 wait_event_info)
if (returnCode == 0 && VfdCache[file].fileSize > offset) if (returnCode == 0 && VfdCache[file].fileSize > offset)
{ {
/* adjust our state for truncation of a temp file */ /* adjust our state for truncation of a temp file */
Assert(VfdCache[file].fdstate & FD_TEMPORARY); Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
temporary_files_size -= VfdCache[file].fileSize - offset; temporary_files_size -= VfdCache[file].fileSize - offset;
VfdCache[file].fileSize = offset; VfdCache[file].fileSize = offset;
} }
...@@ -2593,6 +2813,24 @@ TempTablespacesAreSet(void) ...@@ -2593,6 +2813,24 @@ TempTablespacesAreSet(void)
return (numTempTableSpaces >= 0); return (numTempTableSpaces >= 0);
} }
/*
* GetTempTablespaces
*
* Populate an array with the OIDs of the tablespaces that should be used for
* temporary files. Return the number that were copied into the output array.
*/
int
GetTempTablespaces(Oid *tableSpaces, int numSpaces)
{
int i;
Assert(TempTablespacesAreSet());
for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
tableSpaces[i] = tempTableSpaces[i];
return i;
}
/* /*
* GetNextTempTableSpace * GetNextTempTableSpace
* *
...@@ -2696,7 +2934,8 @@ CleanupTempFiles(bool isProcExit) ...@@ -2696,7 +2934,8 @@ CleanupTempFiles(bool isProcExit)
{ {
unsigned short fdstate = VfdCache[i].fdstate; unsigned short fdstate = VfdCache[i].fdstate;
if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL) if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
VfdCache[i].fileName != NULL)
{ {
/* /*
* If we're in the process of exiting a backend process, close * If we're in the process of exiting a backend process, close
...@@ -2707,7 +2946,7 @@ CleanupTempFiles(bool isProcExit) ...@@ -2707,7 +2946,7 @@ CleanupTempFiles(bool isProcExit)
*/ */
if (isProcExit) if (isProcExit)
FileClose(i); FileClose(i);
else if (fdstate & FD_XACT_TEMPORARY) else if (fdstate & FD_CLOSE_AT_EOXACT)
{ {
elog(WARNING, elog(WARNING,
"temporary file %s not closed at end-of-transaction", "temporary file %s not closed at end-of-transaction",
...@@ -2751,7 +2990,7 @@ RemovePgTempFiles(void) ...@@ -2751,7 +2990,7 @@ RemovePgTempFiles(void)
* First process temp files in pg_default ($PGDATA/base) * First process temp files in pg_default ($PGDATA/base)
*/ */
snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR); snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
RemovePgTempFilesInDir(temp_path); RemovePgTempFilesInDir(temp_path, false);
RemovePgTempRelationFiles("base"); RemovePgTempRelationFiles("base");
/* /*
...@@ -2767,7 +3006,7 @@ RemovePgTempFiles(void) ...@@ -2767,7 +3006,7 @@ RemovePgTempFiles(void)
snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s", snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
RemovePgTempFilesInDir(temp_path); RemovePgTempFilesInDir(temp_path, false);
snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
...@@ -2785,9 +3024,15 @@ RemovePgTempFiles(void) ...@@ -2785,9 +3024,15 @@ RemovePgTempFiles(void)
#endif #endif
} }
/* Process one pgsql_tmp directory for RemovePgTempFiles */ /*
* Process one pgsql_tmp directory for RemovePgTempFiles. At the top level in
* each tablespace, this should be called with unlink_all = false, so that
* only files matching the temporary name prefix will be unlinked. When
* recursing it will be called with unlink_all = true to unlink everything
* under a top-level temporary directory.
*/
static void static void
RemovePgTempFilesInDir(const char *tmpdirname) RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all)
{ {
DIR *temp_dir; DIR *temp_dir;
struct dirent *temp_de; struct dirent *temp_de;
...@@ -2813,10 +3058,25 @@ RemovePgTempFilesInDir(const char *tmpdirname) ...@@ -2813,10 +3058,25 @@ RemovePgTempFilesInDir(const char *tmpdirname)
snprintf(rm_path, sizeof(rm_path), "%s/%s", snprintf(rm_path, sizeof(rm_path), "%s/%s",
tmpdirname, temp_de->d_name); tmpdirname, temp_de->d_name);
if (strncmp(temp_de->d_name, if (unlink_all ||
strncmp(temp_de->d_name,
PG_TEMP_FILE_PREFIX, PG_TEMP_FILE_PREFIX,
strlen(PG_TEMP_FILE_PREFIX)) == 0) strlen(PG_TEMP_FILE_PREFIX)) == 0)
unlink(rm_path); /* note we ignore any error */ {
struct stat statbuf;
/* note that we ignore any error here and below */
if (lstat(rm_path, &statbuf) < 0)
continue;
if (S_ISDIR(statbuf.st_mode))
{
RemovePgTempFilesInDir(rm_path, true);
rmdir(rm_path);
}
else
unlink(rm_path);
}
else else
elog(LOG, elog(LOG,
"unexpected file found in temporary-files directory: \"%s\"", "unexpected file found in temporary-files directory: \"%s\"",
...@@ -3152,6 +3412,23 @@ datadir_fsync_fname(const char *fname, bool isdir, int elevel) ...@@ -3152,6 +3412,23 @@ datadir_fsync_fname(const char *fname, bool isdir, int elevel)
fsync_fname_ext(fname, isdir, true, elevel); fsync_fname_ext(fname, isdir, true, elevel);
} }
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
if (isdir)
{
if (rmdir(fname) != 0 && errno != ENOENT)
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not rmdir directory \"%s\": %m", fname)));
}
else
{
/* Use PathNameDeleteTemporaryFile to report filesize */
PathNameDeleteTemporaryFile(fname, false);
}
}
/* /*
* fsync_fname_ext -- Try to fsync a file or directory * fsync_fname_ext -- Try to fsync a file or directory
* *
......
/*-------------------------------------------------------------------------
*
* sharedfileset.c
* Shared temporary file management.
*
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/storage/file/sharedfileset.c
*
* SharefFileSets provide a temporary namespace (think directory) so that
* files can be discovered by name, and a shared ownership semantics so that
* shared files survive until the last user detaches.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/hash.h"
#include "catalog/pg_tablespace.h"
#include "commands/tablespace.h"
#include "miscadmin.h"
#include "storage/dsm.h"
#include "storage/sharedfileset.h"
#include "utils/builtins.h"
static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum);
static void SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace);
static void SharedFilePath(char *path, SharedFileSet *fileset, const char *name);
static Oid ChooseTablespace(const SharedFileSet *fileset, const char *name);
/*
* Initialize a space for temporary files that can be opened for read-only
* access by other backends. Other backends must attach to it before
* accessing it. Associate this SharedFileSet with 'seg'. Any contained
* files will be deleted when the last backend detaches.
*
* Files will be distributed over the tablespaces configured in
* temp_tablespaces.
*
* Under the covers the set is one or more directories which will eventually
* be deleted when there are no backends attached.
*/
void
SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
{
static uint32 counter = 0;
SpinLockInit(&fileset->mutex);
fileset->refcnt = 1;
fileset->creator_pid = MyProcPid;
fileset->number = counter;
counter = (counter + 1) % INT_MAX;
/* Capture the tablespace OIDs so that all backends agree on them. */
PrepareTempTablespaces();
fileset->ntablespaces =
GetTempTablespaces(&fileset->tablespaces[0],
lengthof(fileset->tablespaces));
if (fileset->ntablespaces == 0)
{
fileset->tablespaces[0] = DEFAULTTABLESPACE_OID;
fileset->ntablespaces = 1;
}
/* Register our cleanup callback. */
on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
}
/*
* Attach to a set of directories that was created with SharedFileSetInit.
*/
void
SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg)
{
bool success;
SpinLockAcquire(&fileset->mutex);
if (fileset->refcnt == 0)
success = false;
else
{
++fileset->refcnt;
success = true;
}
SpinLockRelease(&fileset->mutex);
if (!success)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("could not attach to a SharedFileSet that is already destroyed")));
/* Register our cleanup callback. */
on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
}
/*
* Create a new file in the given set.
*/
File
SharedFileSetCreate(SharedFileSet *fileset, const char *name)
{
char path[MAXPGPATH];
File file;
SharedFilePath(path, fileset, name);
file = PathNameCreateTemporaryFile(path, false);
/* If we failed, see if we need to create the directory on demand. */
if (file <= 0)
{
char tempdirpath[MAXPGPATH];
char filesetpath[MAXPGPATH];
Oid tablespace = ChooseTablespace(fileset, name);
TempTablespacePath(tempdirpath, tablespace);
SharedFileSetPath(filesetpath, fileset, tablespace);
PathNameCreateTemporaryDir(tempdirpath, filesetpath);
file = PathNameCreateTemporaryFile(path, true);
}
return file;
}
/*
* Open a file that was created with SharedFileSetCreate(), possibly in
* another backend.
*/
File
SharedFileSetOpen(SharedFileSet *fileset, const char *name)
{
char path[MAXPGPATH];
File file;
SharedFilePath(path, fileset, name);
file = PathNameOpenTemporaryFile(path);
return file;
}
/*
* Delete a file that was created with PathNameCreateShared().
* Return true if the file existed, false if didn't.
*/
bool
SharedFileSetDelete(SharedFileSet *fileset, const char *name,
bool error_on_failure)
{
char path[MAXPGPATH];
SharedFilePath(path, fileset, name);
return PathNameDeleteTemporaryFile(path, error_on_failure);
}
/*
* Delete all files in the set.
*/
void
SharedFileSetDeleteAll(SharedFileSet *fileset)
{
char dirpath[MAXPGPATH];
int i;
/*
* Delete the directory we created in each tablespace. Doesn't fail
* because we use this in error cleanup paths, but can generate LOG
* message on IO error.
*/
for (i = 0; i < fileset->ntablespaces; ++i)
{
SharedFileSetPath(dirpath, fileset, fileset->tablespaces[i]);
PathNameDeleteTemporaryDir(dirpath);
}
}
/*
* Callback function that will be invoked when this backend detaches from a
* DSM segment holding a SharedFileSet that it has created or attached to. If
* we are the last to detach, then try to remove the directories and
* everything in them. We can't raise an error on failures, because this runs
* in error cleanup paths.
*/
static void
SharedFileSetOnDetach(dsm_segment *segment, Datum datum)
{
bool unlink_all = false;
SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum);
SpinLockAcquire(&fileset->mutex);
Assert(fileset->refcnt > 0);
if (--fileset->refcnt == 0)
unlink_all = true;
SpinLockRelease(&fileset->mutex);
/*
* If we are the last to detach, we delete the directory in all
* tablespaces. Note that we are still actually attached for the rest of
* this function so we can safely access its data.
*/
if (unlink_all)
SharedFileSetDeleteAll(fileset);
}
/*
* Build the path for the directory holding the files backing a SharedFileSet
* in a given tablespace.
*/
static void
SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace)
{
char tempdirpath[MAXPGPATH];
TempTablespacePath(tempdirpath, tablespace);
snprintf(path, MAXPGPATH, "%s/%s%d.%u.sharedfileset",
tempdirpath, PG_TEMP_FILE_PREFIX,
fileset->creator_pid, fileset->number);
}
/*
* Sorting hat to determine which tablespace a given shared temporary file
* belongs in.
*/
static Oid
ChooseTablespace(const SharedFileSet *fileset, const char *name)
{
uint32 hash = hash_any((const unsigned char *) name, strlen(name));
return fileset->tablespaces[hash % fileset->ntablespaces];
}
/*
* Compute the full path of a file in a SharedFileSet.
*/
static void
SharedFilePath(char *path, SharedFileSet *fileset, const char *name)
{
char dirpath[MAXPGPATH];
SharedFileSetPath(dirpath, fileset, ChooseTablespace(fileset, name));
snprintf(path, MAXPGPATH, "%s/%s", dirpath, name);
}
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
#ifndef BUFFILE_H #ifndef BUFFILE_H
#define BUFFILE_H #define BUFFILE_H
#include "storage/sharedfileset.h"
/* BufFile is an opaque type whose details are not known outside buffile.c. */ /* BufFile is an opaque type whose details are not known outside buffile.c. */
typedef struct BufFile BufFile; typedef struct BufFile BufFile;
...@@ -42,4 +44,9 @@ extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence); ...@@ -42,4 +44,9 @@ extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
extern void BufFileTell(BufFile *file, int *fileno, off_t *offset); extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
extern int BufFileSeekBlock(BufFile *file, long blknum); extern int BufFileSeekBlock(BufFile *file, long blknum);
extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name);
extern void BufFileExportShared(BufFile *file);
extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name);
extern void BufFileDeleteShared(SharedFileSet *fileset, const char *name);
#endif /* BUFFILE_H */ #endif /* BUFFILE_H */
...@@ -79,6 +79,14 @@ extern int FileGetRawDesc(File file); ...@@ -79,6 +79,14 @@ extern int FileGetRawDesc(File file);
extern int FileGetRawFlags(File file); extern int FileGetRawFlags(File file);
extern mode_t FileGetRawMode(File file); extern mode_t FileGetRawMode(File file);
/* Operations used for sharing named temporary files */
extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure);
extern File PathNameOpenTemporaryFile(const char *name);
extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure);
extern void PathNameCreateTemporaryDir(const char *base, const char *name);
extern void PathNameDeleteTemporaryDir(const char *name);
extern void TempTablespacePath(char *path, Oid tablespace);
/* Operations that allow use of regular stdio --- USE WITH CAUTION */ /* Operations that allow use of regular stdio --- USE WITH CAUTION */
extern FILE *AllocateFile(const char *name, const char *mode); extern FILE *AllocateFile(const char *name, const char *mode);
extern int FreeFile(FILE *file); extern int FreeFile(FILE *file);
...@@ -107,6 +115,7 @@ extern void set_max_safe_fds(void); ...@@ -107,6 +115,7 @@ extern void set_max_safe_fds(void);
extern void closeAllVfds(void); extern void closeAllVfds(void);
extern void SetTempTablespaces(Oid *tableSpaces, int numSpaces); extern void SetTempTablespaces(Oid *tableSpaces, int numSpaces);
extern bool TempTablespacesAreSet(void); extern bool TempTablespacesAreSet(void);
extern int GetTempTablespaces(Oid *tableSpaces, int numSpaces);
extern Oid GetNextTempTableSpace(void); extern Oid GetNextTempTableSpace(void);
extern void AtEOXact_Files(void); extern void AtEOXact_Files(void);
extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
...@@ -124,7 +133,7 @@ extern int durable_unlink(const char *fname, int loglevel); ...@@ -124,7 +133,7 @@ extern int durable_unlink(const char *fname, int loglevel);
extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel); extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
extern void SyncDataDirectory(void); extern void SyncDataDirectory(void);
/* Filename components for OpenTemporaryFile */ /* Filename components */
#define PG_TEMP_FILES_DIR "pgsql_tmp" #define PG_TEMP_FILES_DIR "pgsql_tmp"
#define PG_TEMP_FILE_PREFIX "pgsql_tmp" #define PG_TEMP_FILE_PREFIX "pgsql_tmp"
......
/*-------------------------------------------------------------------------
*
* sharedfileset.h
* Shared temporary file management.
*
*
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/sharedfilespace.h
*
*-------------------------------------------------------------------------
*/
#ifndef SHAREDFILESET_H
#define SHAREDFILESET_H
#include "storage/dsm.h"
#include "storage/fd.h"
#include "storage/spin.h"
/*
* A set of temporary files that can be shared by multiple backends.
*/
typedef struct SharedFileSet
{
pid_t creator_pid; /* PID of the creating process */
uint32 number; /* per-PID identifier */
slock_t mutex; /* mutex protecting the reference count */
int refcnt; /* number of attached backends */
int ntablespaces; /* number of tablespaces to use */
Oid tablespaces[8]; /* OIDs of tablespaces to use. Assumes that
* it's rare that there more than temp
* tablespaces. */
} SharedFileSet;
extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg);
extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg);
extern File SharedFileSetCreate(SharedFileSet *fileset, const char *name);
extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name);
extern bool SharedFileSetDelete(SharedFileSet *fileset, const char *name,
bool error_on_failure);
extern void SharedFileSetDeleteAll(SharedFileSet *fileset);
#endif
...@@ -2026,6 +2026,7 @@ SharedBitmapState ...@@ -2026,6 +2026,7 @@ SharedBitmapState
SharedDependencyObjectType SharedDependencyObjectType
SharedDependencyType SharedDependencyType
SharedExecutorInstrumentation SharedExecutorInstrumentation
SharedFileSet
SharedInvalCatalogMsg SharedInvalCatalogMsg
SharedInvalCatcacheMsg SharedInvalCatcacheMsg
SharedInvalRelcacheMsg SharedInvalRelcacheMsg
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment