Commit 3eb77eba authored by Thomas Munro

Refactor the fsync queue for wider use.

Previously, md.c and checkpointer.c were tightly integrated so that
fsync calls could be handed off and processed in the background.
Introduce a system of callbacks and file tags, so that other modules
can hand off fsync work in the same way.

For now only md.c uses the new interface, but other users are being
proposed.  Since there may be use cases that are not strictly SMGR
implementations, use a new function table for sync handlers rather
than extending the traditional SMGR one.

Instead of using a bitmapset of segment numbers for each RelFileNode
in the checkpointer's hash table, make the segment number part of the
key.  This requires sending explicit "forget" requests for every
segment individually when relations are dropped, but suits the file
layout schemes of proposed future users better (i.e. sparse or high
segment numbers).

Author: Shawn Debnath and Thomas Munro
Reviewed-by: Thomas Munro, Andres Freund
Discussion: https://postgr.es/m/CAEepm=2gTANm=e3ARnJT=n0h8hf88wqmaZxk0JYkxw+b21fNrw@mail.gmail.com
parent 33215d11
......@@ -98,6 +98,7 @@
#include "replication/walsender.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
......
......@@ -50,6 +50,7 @@
#include "storage/fd.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/md.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
......
......@@ -66,6 +66,7 @@
#include "storage/reinit.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "storage/sync.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/memutils.h"
......@@ -6981,7 +6982,7 @@ StartupXLOG(void)
if (ArchiveRecoveryRequested && IsUnderPostmaster)
{
PublishStartupProcessInformation();
SetForwardFsyncRequests();
EnableSyncRequestForwarding();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
}
......@@ -8566,7 +8567,7 @@ CreateCheckPoint(int flags)
* the REDO pointer. Note that smgr must not do anything that'd have to
* be undone if we decide no checkpoint is needed.
*/
smgrpreckpt();
SyncPreCheckpoint();
/* Begin filling in the checkpoint WAL record */
MemSet(&checkPoint, 0, sizeof(checkPoint));
......@@ -8856,7 +8857,7 @@ CreateCheckPoint(int flags)
/*
* Let smgr do post-checkpoint cleanup (eg, deleting old files).
*/
smgrpostckpt();
SyncPostCheckpoint();
/*
* Update the average distance between checkpoints if the prior checkpoint
......
......@@ -54,6 +54,7 @@
#include "storage/fd.h"
#include "storage/lmgr.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "utils/acl.h"
......@@ -941,11 +942,11 @@ dropdb(const char *dbname, bool missing_ok)
* worse, it will delete files that belong to a newly created database
* with the same OID.
*/
ForgetDatabaseFsyncRequests(db_id);
ForgetDatabaseSyncRequests(db_id);
/*
* Force a checkpoint to make sure the checkpointer has received the
* message sent by ForgetDatabaseFsyncRequests. On Windows, this also
* message sent by ForgetDatabaseSyncRequests. On Windows, this also
* ensures that background procs don't hold any open files, which would
* cause rmdir() to fail.
*/
......@@ -2150,7 +2151,7 @@ dbase_redo(XLogReaderState *record)
DropDatabaseBuffers(xlrec->db_id);
/* Also, clean out any fsync requests that might be pending in md.c */
ForgetDatabaseFsyncRequests(xlrec->db_id);
ForgetDatabaseSyncRequests(xlrec->db_id);
/* Clean out the xlog relcache too */
XLogDropDatabase(xlrec->db_id);
......
......@@ -108,10 +108,8 @@
*/
typedef struct
{
RelFileNode rnode;
ForkNumber forknum;
BlockNumber segno; /* see md.c for special values */
/* might add a real request-type field later; not needed yet */
SyncRequestType type; /* request type */
FileTag ftag; /* file identifier */
} CheckpointerRequest;
typedef struct
......@@ -349,7 +347,7 @@ CheckpointerMain(void)
/*
* Process any requests or signals received recently.
*/
AbsorbFsyncRequests();
AbsorbSyncRequests();
if (got_SIGHUP)
{
......@@ -684,7 +682,7 @@ CheckpointWriteDelay(int flags, double progress)
UpdateSharedMemoryConfig();
}
AbsorbFsyncRequests();
AbsorbSyncRequests();
absorb_counter = WRITES_PER_ABSORB;
CheckArchiveTimeout();
......@@ -709,7 +707,7 @@ CheckpointWriteDelay(int flags, double progress)
* operations even when we don't sleep, to prevent overflow of the
* fsync request queue.
*/
AbsorbFsyncRequests();
AbsorbSyncRequests();
absorb_counter = WRITES_PER_ABSORB;
}
}
......@@ -1084,7 +1082,7 @@ RequestCheckpoint(int flags)
}
/*
* ForwardFsyncRequest
* ForwardSyncRequest
* Forward a file-fsync request from a backend to the checkpointer
*
* Whenever a backend is compelled to write directly to a relation
......@@ -1093,15 +1091,6 @@ RequestCheckpoint(int flags)
* is dirty and must be fsync'd before next checkpoint. We also use this
* opportunity to count such writes for statistical purposes.
*
* This functionality is only supported for regular (not backend-local)
* relations, so the rnode argument is intentionally RelFileNode not
* RelFileNodeBackend.
*
* segno specifies which segment (not block!) of the relation needs to be
* fsync'd. (Since the valid range is much less than BlockNumber, we can
* use high values for special flags; that's all internal to md.c, which
* see for details.)
*
* To avoid holding the lock for longer than necessary, we normally write
* to the requests[] queue without checking for duplicates. The checkpointer
* will have to eliminate dups internally anyway. However, if we discover
......@@ -1113,7 +1102,7 @@ RequestCheckpoint(int flags)
* let the backend know by returning false.
*/
bool
ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
{
CheckpointerRequest *request;
bool too_full;
......@@ -1122,7 +1111,7 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
return false; /* probably shouldn't even get here */
if (AmCheckpointerProcess())
elog(ERROR, "ForwardFsyncRequest must not be called in checkpointer");
elog(ERROR, "ForwardSyncRequest must not be called in checkpointer");
LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
......@@ -1151,9 +1140,8 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
/* OK, insert request */
request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
request->rnode = rnode;
request->forknum = forknum;
request->segno = segno;
request->ftag = *ftag;
request->type = type;
/* If queue is more than half full, nudge the checkpointer to empty it */
too_full = (CheckpointerShmem->num_requests >=
......@@ -1284,8 +1272,8 @@ CompactCheckpointerRequestQueue(void)
}
/*
* AbsorbFsyncRequests
* Retrieve queued fsync requests and pass them to local smgr.
* AbsorbSyncRequests
* Retrieve queued sync requests and pass them to sync mechanism.
*
* This is exported because it must be called during CreateCheckPoint;
* we have to be sure we have accepted all pending requests just before
......@@ -1293,7 +1281,7 @@ CompactCheckpointerRequestQueue(void)
* non-checkpointer processes, do nothing if not checkpointer.
*/
void
AbsorbFsyncRequests(void)
AbsorbSyncRequests(void)
{
CheckpointerRequest *requests = NULL;
CheckpointerRequest *request;
......@@ -1335,7 +1323,7 @@ AbsorbFsyncRequests(void)
LWLockRelease(CheckpointerCommLock);
for (request = requests; n > 0; request++, n--)
RememberFsyncRequest(request->rnode, request->forknum, request->segno);
RememberSyncRequest(&request->ftag, request->type);
END_CRIT_SECTION();
......
......@@ -8,6 +8,6 @@ subdir = src/backend/storage
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
SUBDIRS = buffer file freespace ipc large_object lmgr page smgr
SUBDIRS = buffer file freespace ipc large_object lmgr page smgr sync
include $(top_srcdir)/src/backend/common.mk
......@@ -2584,7 +2584,7 @@ CheckPointBuffers(int flags)
BufferSync(flags);
CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
smgrsync();
ProcessSyncRequests();
CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
}
......
This diff is collapsed.
......@@ -21,6 +21,7 @@
#include "lib/ilist.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
......@@ -60,12 +61,8 @@ typedef struct f_smgr
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_pre_ckpt) (void); /* may be NULL */
void (*smgr_sync) (void); /* may be NULL */
void (*smgr_post_ckpt) (void); /* may be NULL */
} f_smgr;
static const f_smgr smgrsw[] = {
/* magnetic disk */
{
......@@ -83,15 +80,11 @@ static const f_smgr smgrsw[] = {
.smgr_nblocks = mdnblocks,
.smgr_truncate = mdtruncate,
.smgr_immedsync = mdimmedsync,
.smgr_pre_ckpt = mdpreckpt,
.smgr_sync = mdsync,
.smgr_post_ckpt = mdpostckpt
}
};
static const int NSmgr = lengthof(smgrsw);
/*
* Each backend has a hashtable that stores all extant SMgrRelation objects.
* In addition, "unowned" SMgrRelation objects are chained together in a list.
......@@ -705,52 +698,6 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
}
/*
* smgrpreckpt() -- Prepare for checkpoint.
*/
void
smgrpreckpt(void)
{
int i;
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_pre_ckpt)
smgrsw[i].smgr_pre_ckpt();
}
}
/*
* smgrsync() -- Sync files to disk during checkpoint.
*/
void
smgrsync(void)
{
int i;
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_sync)
smgrsw[i].smgr_sync();
}
}
/*
* smgrpostckpt() -- Post-checkpoint cleanup.
*/
void
smgrpostckpt(void)
{
int i;
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_post_ckpt)
smgrsw[i].smgr_post_ckpt();
}
}
/*
* AtEOXact_SMgr
*
......
#-------------------------------------------------------------------------
#
# Makefile--
# Makefile for storage/sync
#
# IDENTIFICATION
# src/backend/storage/sync/Makefile
#
#-------------------------------------------------------------------------
subdir = src/backend/storage/sync
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = sync.o
include $(top_srcdir)/src/backend/common.mk
This diff is collapsed.
......@@ -51,6 +51,7 @@
#include "storage/proc.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "tcop/tcopprot.h"
#include "utils/acl.h"
#include "utils/fmgroids.h"
......@@ -557,6 +558,7 @@ BaseInit(void)
/* Do local initialization of file, storage and buffer managers */
InitFileAccess();
InitSync();
smgrinit();
InitBufferPoolAccess();
}
......
......@@ -17,6 +17,8 @@
#include "storage/block.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "storage/sync.h"
/* GUC options */
......@@ -31,9 +33,9 @@ extern void CheckpointerMain(void) pg_attribute_noreturn();
extern void RequestCheckpoint(int flags);
extern void CheckpointWriteDelay(int flags, double progress);
extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno);
extern void AbsorbFsyncRequests(void);
extern bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type);
extern void AbsorbSyncRequests(void);
extern Size CheckpointerShmemSize(void);
extern void CheckpointerShmemInit(void);
......
......@@ -54,6 +54,18 @@ extern PGDLLIMPORT bool data_sync_retry;
*/
extern int max_safe_fds;
/*
* On Windows, we have to interpret EACCES as possibly meaning the same as
* ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
* that's what you get. Ugh. This code is designed so that we don't
* actually believe these cases are okay without further evidence (namely,
* a pending fsync request getting canceled ... see ProcessSyncRequests).
*/
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
#endif
/*
* prototypes for functions in fd.c
......
/*-------------------------------------------------------------------------
*
* md.h
* magnetic disk storage manager public interface declarations.
*
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/md.h
*
*-------------------------------------------------------------------------
*/
#ifndef MD_H
#define MD_H
#include "storage/block.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "storage/sync.h"
/* md storage manager functionality */
extern void mdinit(void);
extern void mdclose(SMgrRelation reln, ForkNumber forknum);
extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
extern void mdextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void ForgetDatabaseSyncRequests(Oid dbid);
extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
/*
 * md sync callbacks.  Each one takes the FileTag identifying the file to
 * act on.
 *
 * NOTE(review): presumably these are md.c's sync, unlink and tag-matching
 * entries for SYNC_HANDLER_MD in the function table in sync.c, and "path"
 * is an output buffer for the file name -- confirm against sync.c and md.c.
 */
extern int mdsyncfiletag(const FileTag *ftag, char *path);
extern int mdunlinkfiletag(const FileTag *ftag, char *path);
extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate);
#endif /* MD_H */
......@@ -18,7 +18,6 @@
#include "storage/block.h"
#include "storage/relfilenode.h"
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
* cached file handles. An SMgrRelation is created (if not already present)
......@@ -106,43 +105,6 @@ extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void smgrpreckpt(void);
extern void smgrsync(void);
extern void smgrpostckpt(void);
extern void AtEOXact_SMgr(void);
/* internals: move me elsewhere -- ay 7/94 */
/* in md.c */
extern void mdinit(void);
extern void mdclose(SMgrRelation reln, ForkNumber forknum);
extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
extern void mdextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void mdpreckpt(void);
extern void mdsync(void);
extern void mdpostckpt(void);
extern void SetForwardFsyncRequests(void);
extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno);
extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
extern void ForgetDatabaseFsyncRequests(Oid dbid);
extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
#endif /* SMGR_H */
/*-------------------------------------------------------------------------
*
* sync.h
* File synchronization management code.
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/sync.h
*
*-------------------------------------------------------------------------
*/
#ifndef SYNC_H
#define SYNC_H
#include "storage/relfilenode.h"
/*
* Type of sync request. These are used to manage the set of pending
* requests to call a sync handler's sync or unlink functions at the next
* checkpoint.
*/
typedef enum SyncRequestType
{
/*
 * A request type is passed together with the FileTag it applies to; see
 * RegisterSyncRequest(), RememberSyncRequest() and ForwardSyncRequest().
 */
SYNC_REQUEST, /* schedule a call of sync function */
SYNC_UNLINK_REQUEST, /* schedule a call of unlink function */
SYNC_FORGET_REQUEST, /* forget all calls for a tag */
SYNC_FILTER_REQUEST /* forget all calls satisfying match fn */
} SyncRequestType;
/*
* Which set of functions to use to handle a given request. See the function
* table in sync.c.
*/
typedef enum SyncRequestHandler
{
/* Values of this enum are stored in FileTag.handler (an int16). */
SYNC_HANDLER_MD = 0 /* md smgr: the magnetic disk storage manager */
} SyncRequestHandler;
/*
* A tag identifying a file. Currently it has the members required for md.c's
* usage, but sync.c has no knowledge of the internal structure, and it is
* liable to change as required by future handlers.
*/
typedef struct FileTag
{
int16 handler; /* SyncRequestHandler value, saving space */
int16 forknum; /* ForkNumber, saving space */
RelFileNode rnode; /* RelFileNode of the file being identified */
uint32 segno; /* segment number; part of the request key */
} FileTag;
extern void InitSync(void);
extern void SyncPreCheckpoint(void);
extern void SyncPostCheckpoint(void);
extern void ProcessSyncRequests(void);
extern void RememberSyncRequest(const FileTag *ftag, SyncRequestType type);
extern void EnableSyncRequestForwarding(void);
extern bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
bool retryOnError);
#endif /* SYNC_H */
......@@ -651,6 +651,7 @@ File
FileFdwExecutionState
FileFdwPlanState
FileNameMap
FileTag
FindSplitData
FixedParallelExecutorState
FixedParallelState
......@@ -1700,7 +1701,7 @@ PathKeysComparison
PathTarget
Pattern_Prefix_Status
Pattern_Type
PendingOperationEntry
PendingFsyncEntry
PendingRelDelete
PendingUnlinkEntry
PendingWriteback
......@@ -2276,7 +2277,10 @@ Subscription
SubscriptionInfo
SubscriptionRelState
Syn
SyncOps
SyncRepConfigData
SyncRequestHandler
SyncRequestType
SysScanDesc
SyscacheCallbackFunction
SystemRowsSamplerData
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment