Commit be86e3dd authored by Tom Lane

Rethink checkpointer's fsync-request table representation.

Instead of having one hash table entry per relation/fork/segment, just have
one per relation, and use bitmapsets to represent which specific segments
need to be fsync'd.  This eliminates the need to scan the whole hash table
to implement FORGET_RELATION_FSYNC, which fixes the O(N^2) behavior
recently demonstrated by Jeff Janes for cases involving lots of TRUNCATE or
DROP TABLE operations during a single checkpoint cycle.  Per an idea from
Robert Haas.

(FORGET_DATABASE_FSYNC still sucks, but since dropping a database is a
pretty expensive operation anyway, we'll live with that.)
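
To make the shape of the change concrete, here is a toy C model of the new representation. It is only a sketch, not the PostgreSQL code: the real table is keyed by RelFileNode and uses the backend's growable Bitmapset type, while this stand-in uses an int key and a fixed-width uint64_t bitmap per fork (so it handles only 64 segments). The point it illustrates is that cancelling a relation's pending fsyncs becomes an update of one entry instead of a scan over every (relation, fork, segment) entry:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_FORKS 4				/* stand-in for MAX_FORKNUM + 1 */

typedef struct
{
	int			relid;					/* stand-in for the RelFileNode key */
	uint64_t	requests[MAX_FORKS];	/* bit n set => fsync segment n of fork */
	bool		canceled[MAX_FORKS];	/* a cancel arrived for this fork */
} PendingEntry;

/* Record that segment "segno" of fork "fork" needs an fsync. */
static void
remember_fsync(PendingEntry *e, int fork, int segno)
{
	e->requests[fork] |= UINT64_C(1) << segno;
}

/* Cancel pending fsyncs for one fork, or for all forks if fork < 0. */
static void
forget_relation_fsync(PendingEntry *e, int fork)
{
	for (int f = 0; f < MAX_FORKS; f++)
	{
		if (fork >= 0 && f != fork)
			continue;
		e->requests[f] = 0;		/* drop the fork's requests: no table scan */
		e->canceled[f] = true;	/* lets an in-progress sync see the cancel */
	}
}

int
main(void)
{
	PendingEntry e = {.relid = 16384};

	remember_fsync(&e, 0, 3);
	remember_fsync(&e, 0, 7);
	forget_relation_fsync(&e, -1);	/* e.g. the relation was dropped */
	printf("fork 0 bitmap after cancel: 0x%llx\n",
		   (unsigned long long) e.requests[0]);
	return 0;
}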

In passing, improve the delayed-unlink code: remove the pass over the list
in mdpreckpt, since it wasn't doing anything for us except supporting a
useless Assert in mdpostckpt, and fix mdpostckpt so that it will absorb
fsync requests every so often when clearing a large backlog of deletion
requests.
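
The mdpostckpt change reuses the absorb-counter idiom that mdsync already employs. A minimal sketch of that loop shape, with hypothetical stand-ins (pending_unlinks, absorb_fsync_requests, unlink_next) for the checkpointer's real list and queue machinery:

#include <stdio.h>

#define UNLINKS_PER_ABSORB 10	/* the interval this commit introduces */

static int	pending_unlinks = 25;	/* pretend 25 files await deletion */

static void
absorb_fsync_requests(void)
{
	printf("draining shared fsync-request queue\n");
}

static void
unlink_next(void)
{
	pending_unlinks--;			/* the real code unlink()s a file here */
}

static void
mdpostckpt_sketch(void)
{
	int			absorb_counter = UNLINKS_PER_ABSORB;

	while (pending_unlinks > 0)
	{
		unlink_next();

		/*
		 * Every UNLINKS_PER_ABSORB deletions, drain the shared request queue
		 * so backends don't block on a full queue while the checkpointer
		 * clears a large backlog of unlinks.
		 */
		if (--absorb_counter <= 0)
		{
			absorb_fsync_requests();
			absorb_counter = UNLINKS_PER_ABSORB;
		}
	}
}

int
main(void)
{
	mdpostckpt_sketch();
	return 0;
}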
parent 3072b7ba
@@ -32,8 +32,9 @@
 #include "pg_trace.h"

-/* interval for calling AbsorbFsyncRequests in mdsync */
+/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
 #define FSYNCS_PER_ABSORB	10
+#define UNLINKS_PER_ABSORB	10

 /*
  * Special values for the segno arg to RememberFsyncRequest.
@@ -51,7 +52,7 @@
  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
  * that's what you get.  Ugh.  This code is designed so that we don't
  * actually believe these cases are okay without further evidence (namely,
- * a pending fsync request getting revoked ... see mdsync).
+ * a pending fsync request getting canceled ... see mdsync).
  */
 #ifndef WIN32
 #define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
@@ -111,12 +112,12 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */

 /*
- * In some contexts (currently, standalone backends and the checkpointer process)
+ * In some contexts (currently, standalone backends and the checkpointer)
  * we keep track of pending fsync operations: we need to remember all relation
  * segments that have been written since the last checkpoint, so that we can
  * fsync them down to disk before completing the next checkpoint.  This hash
  * table remembers the pending operations.  We use a hash table mostly as
- * a convenient way of eliminating duplicate requests.
+ * a convenient way of merging duplicate requests.
  *
  * We use a similar mechanism to remember no-longer-needed files that can
  * be deleted after the next checkpoint, but we use a linked list instead of
@@ -129,20 +130,16 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */
  * (Regular backends do not track pending operations locally, but forward
  * them to the checkpointer.)
  */
-typedef struct
-{
-	RelFileNode rnode;			/* the targeted relation */
-	ForkNumber	forknum;		/* which fork */
-	BlockNumber segno;			/* which segment */
-} PendingOperationTag;
-
 typedef uint16 CycleCtr;		/* can be any convenient integer size */

 typedef struct
 {
-	PendingOperationTag tag;	/* hash table key (must be first!) */
-	bool		canceled;		/* T => request canceled, not yet removed */
-	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr when request was made */
+	RelFileNode rnode;			/* hash table key (must be first!) */
+	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr of oldest request */
+	/* requests[f] has bit n set if we need to fsync segment n of fork f */
+	Bitmapset  *requests[MAX_FORKNUM + 1];
+	/* canceled[f] is true if we canceled fsyncs for fork "recently" */
+	bool		canceled[MAX_FORKNUM + 1];
 } PendingOperationEntry;

 typedef struct
@@ -206,7 +203,7 @@ mdinit(void)
 		HASHCTL		hash_ctl;

 		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-		hash_ctl.keysize = sizeof(PendingOperationTag);
+		hash_ctl.keysize = sizeof(RelFileNode);
 		hash_ctl.entrysize = sizeof(PendingOperationEntry);
 		hash_ctl.hash = tag_hash;
 		hash_ctl.hcxt = MdCxt;
@@ -227,10 +224,19 @@
 void
 SetForwardFsyncRequests(void)
 {
-	/* Perform any pending ops we may have queued up */
+	/* Perform any pending fsyncs we may have queued up, then drop table */
 	if (pendingOpsTable)
+	{
 		mdsync();
+		hash_destroy(pendingOpsTable);
+	}
 	pendingOpsTable = NULL;
+
+	/*
+	 * We should not have any pending unlink requests, since mdunlink doesn't
+	 * queue unlink requests when isRedo.
+	 */
+	Assert(pendingUnlinks == NIL);
 }

 /*
@@ -1046,8 +1052,11 @@
 	hash_seq_init(&hstat, pendingOpsTable);
 	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 	{
+		ForkNumber	forknum;
+
 		/*
-		 * If the entry is new then don't process it this time.  Note that
+		 * If the entry is new then don't process it this time; it might
+		 * contain multiple fsync-request bits, but they are all new.  Note
 		 * "continue" bypasses the hash-remove call at the bottom of the loop.
 		 */
 		if (entry->cycle_ctr == mdsync_cycle_ctr)
@@ -1057,21 +1066,42 @@
 		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

 		/*
-		 * If fsync is off then we don't have to bother opening the file at
-		 * all. (We delay checking until this point so that changing fsync on
-		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,
-		 * fall through to delete it.
-		 */
-		if (enableFsync && !entry->canceled)
-		{
-			int			failures;
+		 * Scan over the forks and segments represented by the entry.
+		 *
+		 * The bitmap manipulations are slightly tricky, because we can call
+		 * AbsorbFsyncRequests() inside the loop and that could result in
+		 * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
+		 * This is okay because we unlink each bitmapset from the hashtable
+		 * entry before scanning it.  That means that any incoming fsync
+		 * requests will be processed now if they reach the table before we
+		 * begin to scan their fork.
+		 */
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			Bitmapset  *requests = entry->requests[forknum];
+			int			segno;
+
+			entry->requests[forknum] = NULL;
+			entry->canceled[forknum] = false;

-			/*
-			 * If in checkpointer, we want to absorb pending requests every so
-			 * often to prevent overflow of the fsync request queue.  It is
-			 * unspecified whether newly-added entries will be visited by
-			 * hash_seq_search, but we don't care since we don't need to
-			 * process them anyway.
-			 */
-			if (--absorb_counter <= 0)
+			while ((segno = bms_first_member(requests)) >= 0)
 			{
+				int			failures;
+
+				/*
+				 * If fsync is off then we don't have to bother opening the
+				 * file at all.  (We delay checking until this point so that
+				 * changing fsync on the fly behaves sensibly.)
+				 */
+				if (!enableFsync)
+					continue;
+
+				/*
+				 * If in checkpointer, we want to absorb pending requests
+				 * every so often to prevent overflow of the fsync request
+				 * queue.  It is unspecified whether newly-added entries will
+				 * be visited by hash_seq_search, but we don't care since we
+				 * don't need to process them anyway.
+				 */
+				if (--absorb_counter <= 0)
+				{
@@ -1080,47 +1110,43 @@
 				}

 				/*
-				 * The fsync table could contain requests to fsync segments that
-				 * have been deleted (unlinked) by the time we get to them. Rather
-				 * than just hoping an ENOENT (or EACCES on Windows) error can be
-				 * ignored, what we do on error is absorb pending requests and
-				 * then retry. Since mdunlink() queues a "revoke" message before
-				 * actually unlinking, the fsync request is guaranteed to be
-				 * marked canceled after the absorb if it really was this case.
-				 * DROP DATABASE likewise has to tell us to forget fsync requests
-				 * before it starts deletions.
+				 * The fsync table could contain requests to fsync segments
+				 * that have been deleted (unlinked) by the time we get to
+				 * them.  Rather than just hoping an ENOENT (or EACCES on
+				 * Windows) error can be ignored, what we do on error is
+				 * absorb pending requests and then retry.  Since mdunlink()
+				 * queues a "cancel" message before actually unlinking, the
+				 * fsync request is guaranteed to be marked canceled after the
+				 * absorb if it really was this case.  DROP DATABASE likewise
+				 * has to tell us to forget fsync requests before it starts
+				 * deletions.
 				 */
 				for (failures = 0;; failures++) /* loop exits at "break" */
 				{
 					SMgrRelation reln;
 					MdfdVec    *seg;
 					char	   *path;
+					int			save_errno;

 					/*
-					 * Find or create an smgr hash entry for this relation. This
-					 * may seem a bit unclean -- md calling smgr?  But it's really
-					 * the best solution.  It ensures that the open file reference
-					 * isn't permanently leaked if we get an error here. (You may
-					 * say "but an unreferenced SMgrRelation is still a leak!" Not
-					 * really, because the only case in which a checkpoint is done
-					 * by a process that isn't about to shut down is in the
-					 * checkpointer, and it will periodically do smgrcloseall().
-					 * This fact justifies our not closing the reln in the success
-					 * path either, which is a good thing since in
-					 * non-checkpointer cases we couldn't safely do that.)
+					 * Find or create an smgr hash entry for this relation.
+					 * This may seem a bit unclean -- md calling smgr?  But
+					 * it's really the best solution.  It ensures that the
+					 * open file reference isn't permanently leaked if we get
+					 * an error here.  (You may say "but an unreferenced
+					 * SMgrRelation is still a leak!"  Not really, because the
+					 * only case in which a checkpoint is done by a process
+					 * that isn't about to shut down is in the checkpointer,
+					 * and it will periodically do smgrcloseall().  This fact
+					 * justifies our not closing the reln in the success path
+					 * either, which is a good thing since in non-checkpointer
+					 * cases we couldn't safely do that.)
 					 */
-					reln = smgropen(entry->tag.rnode, InvalidBackendId);
+					reln = smgropen(entry->rnode, InvalidBackendId);

-					/*
-					 * It is possible that the relation has been dropped or
-					 * truncated since the fsync request was entered. Therefore,
-					 * allow ENOENT, but only if we didn't fail already on this
-					 * file. This applies both during _mdfd_getseg() and during
-					 * FileSync, since fd.c might have closed the file behind our
-					 * back.
-					 */
-					seg = _mdfd_getseg(reln, entry->tag.forknum,
-									   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
+					/* Attempt to open and fsync the target segment */
+					seg = _mdfd_getseg(reln, forknum,
+									   (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
 									   false, EXTENSION_RETURN_NULL);

 					INSTR_TIME_SET_CURRENT(sync_start);
@@ -1128,6 +1154,7 @@
 					if (seg != NULL &&
 						FileSync(seg->mdfd_vfd) >= 0)
 					{
+						/* Success; update statistics about sync timing */
 						INSTR_TIME_SET_CURRENT(sync_end);
 						sync_diff = sync_end;
 						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
@@ -1138,23 +1165,36 @@
 						processed++;
 						if (log_checkpoints)
 							elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
-								 processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);
+								 processed,
+								 FilePathName(seg->mdfd_vfd),
+								 (double) elapsed / 1000);

-						break;	/* success; break out of retry loop */
+						break;	/* out of retry loop */
 					}

+					/* Compute file name for use in message */
+					save_errno = errno;
+					path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
+					errno = save_errno;
+
 					/*
+					 * It is possible that the relation has been dropped or
+					 * truncated since the fsync request was entered.
+					 * Therefore, allow ENOENT, but only if we didn't fail
+					 * already on this file.  This applies both for
+					 * _mdfd_getseg() and for FileSync, since fd.c might have
+					 * closed the file behind our back.
+					 *
 					 * XXX is there any point in allowing more than one retry?
-					 * Don't see one at the moment, but easy to change the test
-					 * here if so.
+					 * Don't see one at the moment, but easy to change the
+					 * test here if so.
 					 */
-					path = _mdfd_segpath(reln, entry->tag.forknum,
-										 entry->tag.segno);
 					if (!FILE_POSSIBLY_DELETED(errno) ||
 						failures > 0)
 						ereport(ERROR,
 								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\": %m", path)));
+								 errmsg("could not fsync file \"%s\": %m",
+										path)));
 					else
 						ereport(DEBUG1,
 								(errcode_for_file_access(),
@@ -1163,24 +1203,39 @@
 					pfree(path);

 					/*
-					 * Absorb incoming requests and check to see if canceled.
+					 * Absorb incoming requests and check to see if a cancel
+					 * arrived for this relation fork.
 					 */
 					AbsorbFsyncRequests();
 					absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */

-					if (entry->canceled)
+					if (entry->canceled[forknum])
 						break;
 				}				/* end retry loop */
 			}
+			bms_free(requests);
+		}

 		/*
-		 * If we get here, either we fsync'd successfully, or we don't have to
-		 * because enableFsync is off, or the entry is (now) marked canceled.
-		 * Okay to delete it.
+		 * We've finished everything that was requested before we started to
+		 * scan the entry.  If no new requests have been inserted meanwhile,
+		 * remove the entry.  Otherwise, update its cycle counter, as all the
+		 * requests now in it must have arrived during this cycle.
 		 */
-		if (hash_search(pendingOpsTable, &entry->tag,
-						HASH_REMOVE, NULL) == NULL)
-			elog(ERROR, "pendingOpsTable corrupted");
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			if (entry->requests[forknum] != NULL)
+				break;
+		}
+		if (forknum <= MAX_FORKNUM)
+			entry->cycle_ctr = mdsync_cycle_ctr;
+		else
+		{
+			/* Okay to remove it */
+			if (hash_search(pendingOpsTable, &entry->rnode,
+							HASH_REMOVE, NULL) == NULL)
+				elog(ERROR, "pendingOpsTable corrupted");
+		}
 	}							/* end loop over hashtable entries */

 	/* Return sync performance metrics for report at checkpoint end */
@@ -1209,21 +1264,6 @@
 void
 mdpreckpt(void)
 {
-	ListCell   *cell;
-
-	/*
-	 * In case the prior checkpoint wasn't completed, stamp all entries in the
-	 * list with the current cycle counter.  Anything that's in the list at
-	 * the start of checkpoint can surely be deleted after the checkpoint is
-	 * finished, regardless of when the request was made.
-	 */
-	foreach(cell, pendingUnlinks)
-	{
-		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
-
-		entry->cycle_ctr = mdckpt_cycle_ctr;
-	}
-
 	/*
 	 * Any unlink requests arriving after this point will be assigned the next
 	 * cycle counter, and won't be unlinked until next checkpoint.
@@ -1239,6 +1279,9 @@ mdpreckpt(void)
 void
 mdpostckpt(void)
 {
+	int			absorb_counter;
+
+	absorb_counter = UNLINKS_PER_ABSORB;
 	while (pendingUnlinks != NIL)
 	{
 		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
@@ -1247,13 +1290,15 @@ mdpostckpt(void)
 		/*
 		 * New entries are appended to the end, so if the entry is new we've
 		 * reached the end of old entries.
+		 *
+		 * Note: if just the right number of consecutive checkpoints fail, we
+		 * could be fooled here by cycle_ctr wraparound.  However, the only
+		 * consequence is that we'd delay unlinking for one more checkpoint,
+		 * which is perfectly tolerable.
 		 */
 		if (entry->cycle_ctr == mdckpt_cycle_ctr)
 			break;

-		/* Else assert we haven't missed it */
-		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
-
 		/* Unlink the file */
 		path = relpathperm(entry->rnode, MAIN_FORKNUM);
 		if (unlink(path) < 0)
@@ -1272,8 +1317,21 @@ mdpostckpt(void)
 		}
 		pfree(path);

+		/* And remove the list entry */
 		pendingUnlinks = list_delete_first(pendingUnlinks);
 		pfree(entry);
+
+		/*
+		 * As in mdsync, we don't want to stop absorbing fsync requests for a
+		 * long time when there are many deletions to be done.  We can safely
+		 * call AbsorbFsyncRequests() at this point in the loop (note it might
+		 * try to delete list entries).
+		 */
+		if (--absorb_counter <= 0)
+		{
+			AbsorbFsyncRequests();
+			absorb_counter = UNLINKS_PER_ABSORB;
+		}
 	}
 }
@@ -1353,7 +1411,7 @@ register_unlink(RelFileNodeBackend rnode)
 /*
  * RememberFsyncRequest() -- callback from checkpointer side of fsync request
  *
- * We stuff most fsync requests into the local hash table for execution
+ * We stuff fsync requests into the local hash table for execution
  * during the checkpointer's next checkpoint.  UNLINK requests go into a
  * separate linked list, however, because they get processed separately.
 *
@@ -1365,10 +1423,11 @@ register_unlink(RelFileNodeBackend rnode)
  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
  *	 checkpoint.
+ * Note also that we're assuming real segment numbers don't exceed INT_MAX.
  *
- * (Handling the FORGET_* requests is a tad slow because the hash table has
- * to be searched linearly, but it doesn't seem worth rethinking the table
- * structure for them.)
+ * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
+ * table has to be searched linearly, but dropping a database is a pretty
+ * heavyweight operation anyhow, so we'll live with it.)
  */
 void
 RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1378,18 +1437,37 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 	if (segno == FORGET_RELATION_FSYNC)
 	{
 		/* Remove any pending requests for the relation (one or all forks) */
-		HASH_SEQ_STATUS hstat;
 		PendingOperationEntry *entry;

-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
+													  &rnode,
+													  HASH_FIND,
+													  NULL);
+		if (entry)
 		{
-			if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
-				(entry->tag.forknum == forknum ||
-				 forknum == InvalidForkNumber))
+			/*
+			 * We can't just delete the entry since mdsync could have an
+			 * active hashtable scan.  Instead we delete the bitmapsets; this
+			 * is safe because of the way mdsync is coded.  We also set the
+			 * "canceled" flags so that mdsync can tell that a cancel arrived
+			 * for the fork(s).
+			 */
+			if (forknum == InvalidForkNumber)
 			{
-				/* Okay, cancel this entry */
-				entry->canceled = true;
+				/* remove requests for all forks */
+				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+				{
+					bms_free(entry->requests[forknum]);
+					entry->requests[forknum] = NULL;
+					entry->canceled[forknum] = true;
+				}
+			}
+			else
+			{
+				/* remove requests for single fork */
+				bms_free(entry->requests[forknum]);
+				entry->requests[forknum] = NULL;
+				entry->canceled[forknum] = true;
 			}
 		}
 	}
@@ -1406,10 +1484,15 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 		hash_seq_init(&hstat, pendingOpsTable);
 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 		{
-			if (entry->tag.rnode.dbNode == rnode.dbNode)
+			if (entry->rnode.dbNode == rnode.dbNode)
 			{
-				/* Okay, cancel this entry */
-				entry->canceled = true;
+				/* remove requests for all forks */
+				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+				{
+					bms_free(entry->requests[forknum]);
+					entry->requests[forknum] = NULL;
+					entry->canceled[forknum] = true;
+				}
 			}
 		}
@@ -1449,40 +1532,32 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 	else
 	{
 		/* Normal case: enter a request to fsync this segment */
-		PendingOperationTag key;
+		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
 		PendingOperationEntry *entry;
 		bool		found;

-		/* ensure any pad bytes in the hash key are zeroed */
-		MemSet(&key, 0, sizeof(key));
-		key.rnode = rnode;
-		key.forknum = forknum;
-		key.segno = segno;
-
 		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-													  &key,
+													  &rnode,
 													  HASH_ENTER,
 													  &found);
-		/* if new or previously canceled entry, initialize it */
-		if (!found || entry->canceled)
+		/* if new entry, initialize it */
+		if (!found)
 		{
-			entry->canceled = false;
 			entry->cycle_ctr = mdsync_cycle_ctr;
+			MemSet(entry->requests, 0, sizeof(entry->requests));
+			MemSet(entry->canceled, 0, sizeof(entry->canceled));
 		}

 		/*
 		 * NB: it's intentional that we don't change cycle_ctr if the entry
-		 * already exists.  The fsync request must be treated as old, even
-		 * though the new request will be satisfied too by any subsequent
-		 * fsync.
-		 *
-		 * However, if the entry is present but is marked canceled, we should
-		 * act just as though it wasn't there.  The only case where this could
-		 * happen would be if a file had been deleted, we received but did not
-		 * yet act on the cancel request, and the same relfilenode was then
-		 * assigned to a new file.  We mustn't lose the new request, but it
-		 * should be considered new not old.
+		 * already exists.  The cycle_ctr must represent the oldest fsync
+		 * request that could be in the entry.
 		 */
+		entry->requests[forknum] = bms_add_member(entry->requests[forknum],
+												  (int) segno);
+
+		MemoryContextSwitchTo(oldcxt);
 	}
 }
@@ -1503,7 +1578,7 @@ ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
 	else if (IsUnderPostmaster)
 	{
 		/*
-		 * Notify the checkpointer about it.  If we fail to queue the revoke
+		 * Notify the checkpointer about it.  If we fail to queue the cancel
 		 * message, we have to sleep and try again ... ugly, but hopefully
 		 * won't happen often.
 		 *
@@ -1517,7 +1592,7 @@ ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
 		/*
 		 * Note we don't wait for the checkpointer to actually absorb the
-		 * revoke message; see mdsync() for the implications.
+		 * cancel message; see mdsync() for the implications.
 		 */
 	}
 }