Commit 851a26e2 authored by Tom Lane's avatar Tom Lane

While vacuuming a large table, update upper-level FSM data every so often.

VACUUM updates leaf-level FSM entries immediately after cleaning the
corresponding heap blocks.  fsmpage.c updates the intra-page search trees
on the leaf-level FSM pages when this happens, but it does not touch the
upper-level FSM pages, so that the released space might not actually be
findable by searchers.  Previously, updating the upper-level pages happened
only at the conclusion of the VACUUM run, in a single FreeSpaceMapVacuum()
call.  This is bad because the VACUUM might get canceled before ever
reaching that point, so that from the point of view of searchers no space
has been freed at all, leading to table bloat.

We can improve matters by updating the upper pages immediately after each
cycle of index-cleaning and heap-cleaning, processing just the FSM pages
corresponding to the range of heap blocks we have now fully cleaned.
This adds a small amount of extra work, since the FSM pages leading down
to each range boundary will be touched twice, but it's pretty negligible
compared to everything else going on in a large VACUUM.

If there are no indexes, VACUUM doesn't work in cycles but just cleans
each heap page on first visit.  In that case we just arbitrarily update
upper FSM pages after each 8GB of heap.  That maintains the goal of not
letting all this work slide until the very end, and it doesn't seem worth
expending extra complexity on a case that so seldom occurs in practice.

In either case, the FSM is fully up to date before any attempt is made
to truncate the relation, so that the most likely scenario for VACUUM
cancellation no longer results in out-of-date upper FSM pages.  When
we do successfully truncate, adjusting the FSM to reflect that is now
fully handled within FreeSpaceMapTruncateRel.

Claudio Freire, reviewed by Masahiko Sawada and Jing Wang, some additional
tweaks by me

Discussion: https://postgr.es/m/CAGTBQpYR0uJCNTt3M5GOzBRHo+-GccNO1nCaQ8yEJmZKSW5q1A@mail.gmail.com
parent c0cbe00f
...@@ -84,6 +84,15 @@ ...@@ -84,6 +84,15 @@
#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */ #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */ #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
/*
* When a table has no indexes, vacuum the FSM after every 8GB, approximately
* (it won't be exact because we only vacuum FSM after processing a heap page
* that has some removable tuples). When there are indexes, this is ignored,
* and we vacuum FSM after each index/heap cleaning pass.
*/
#define VACUUM_FSM_EVERY_PAGES \
((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
/* /*
* Guesstimation of number of dead tuples per page. This is used to * Guesstimation of number of dead tuples per page. This is used to
* provide an upper limit to memory allocated when vacuuming small * provide an upper limit to memory allocated when vacuuming small
...@@ -285,9 +294,6 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, ...@@ -285,9 +294,6 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params,
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_FINAL_CLEANUP); PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
/* Vacuum the Free Space Map */
FreeSpaceMapVacuum(onerel);
/* /*
* Update statistics in pg_class. * Update statistics in pg_class.
* *
...@@ -465,7 +471,8 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, ...@@ -465,7 +471,8 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid; TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid;
TransactionId relminmxid = onerel->rd_rel->relminmxid; TransactionId relminmxid = onerel->rd_rel->relminmxid;
BlockNumber empty_pages, BlockNumber empty_pages,
vacuumed_pages; vacuumed_pages,
next_fsm_block_to_vacuum;
double num_tuples, /* total number of nonremovable tuples */ double num_tuples, /* total number of nonremovable tuples */
live_tuples, /* live tuples (reltuples estimate) */ live_tuples, /* live tuples (reltuples estimate) */
tups_vacuumed, /* tuples cleaned up by vacuum */ tups_vacuumed, /* tuples cleaned up by vacuum */
...@@ -501,6 +508,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, ...@@ -501,6 +508,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
relname))); relname)));
empty_pages = vacuumed_pages = 0; empty_pages = vacuumed_pages = 0;
next_fsm_block_to_vacuum = (BlockNumber) 0;
num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0; num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0;
indstats = (IndexBulkDeleteResult **) indstats = (IndexBulkDeleteResult **)
...@@ -752,6 +760,13 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, ...@@ -752,6 +760,13 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
vacrelstats->num_dead_tuples = 0; vacrelstats->num_dead_tuples = 0;
vacrelstats->num_index_scans++; vacrelstats->num_index_scans++;
/*
* Vacuum the Free Space Map to make newly-freed space visible on
* upper-level FSM pages. Note we have not yet processed blkno.
*/
FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno);
next_fsm_block_to_vacuum = blkno;
/* Report that we are once again scanning the heap */ /* Report that we are once again scanning the heap */
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_SCAN_HEAP); PROGRESS_VACUUM_PHASE_SCAN_HEAP);
...@@ -1200,6 +1215,19 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, ...@@ -1200,6 +1215,19 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
*/ */
vacrelstats->num_dead_tuples = 0; vacrelstats->num_dead_tuples = 0;
vacuumed_pages++; vacuumed_pages++;
/*
* Periodically do incremental FSM vacuuming to make newly-freed
* space visible on upper FSM pages. Note: although we've cleaned
* the current block, we haven't yet updated its FSM entry (that
* happens further down), so passing end == blkno is correct.
*/
if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
{
FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum,
blkno);
next_fsm_block_to_vacuum = blkno;
}
} }
freespace = PageGetHeapFreeSpace(page); freespace = PageGetHeapFreeSpace(page);
...@@ -1368,6 +1396,13 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, ...@@ -1368,6 +1396,13 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
vacrelstats->num_index_scans++; vacrelstats->num_index_scans++;
} }
/*
* Vacuum the remainder of the Free Space Map. We must do this whether or
* not there were indexes.
*/
if (blkno > next_fsm_block_to_vacuum)
FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno);
/* report all blocks vacuumed; and that we're cleaning up */ /* report all blocks vacuumed; and that we're cleaning up */
pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
......
...@@ -180,13 +180,13 @@ have a corrupted page, with a parent somewhere with too small a value. ...@@ -180,13 +180,13 @@ have a corrupted page, with a parent somewhere with too small a value.
Secondly, if we detect corrupted pages while we search, traversing down Secondly, if we detect corrupted pages while we search, traversing down
the tree. That check will notice if a parent node is set to too high a value. the tree. That check will notice if a parent node is set to too high a value.
In both cases, the upper nodes on the page are immediately rebuilt, fixing In both cases, the upper nodes on the page are immediately rebuilt, fixing
the corruption. the corruption so far as that page is concerned.
Vacuum updates all the bottom level pages with correct amount of free space VACUUM updates all the bottom-level FSM pages with the correct amount of free
on the heap pages, fixing any outdated values there. After the heap and space on corresponding heap pages, as it proceeds through the heap. This
index passes are done, FreeSpaceMapVacuum is called, and the FSM tree is goes through fsm_set_avail(), so that the upper nodes on those pages are
scanned in depth-first order. This fixes any discrepancies between upper immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range]
and lower level FSM pages. to propagate the new free-space info into the upper pages of the FSM tree.
TODO TODO
---- ----
......
...@@ -108,7 +108,9 @@ static Size fsm_space_cat_to_avail(uint8 cat); ...@@ -108,7 +108,9 @@ static Size fsm_space_cat_to_avail(uint8 cat);
static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
uint8 newValue, uint8 minValue); uint8 newValue, uint8 minValue);
static BlockNumber fsm_search(Relation rel, uint8 min_cat); static BlockNumber fsm_search(Relation rel, uint8 min_cat);
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof); static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
BlockNumber start, BlockNumber end,
bool *eof);
static BlockNumber fsm_get_lastblckno(Relation rel, FSMAddress addr); static BlockNumber fsm_get_lastblckno(Relation rel, FSMAddress addr);
static void fsm_update_recursive(Relation rel, FSMAddress addr, uint8 new_cat); static void fsm_update_recursive(Relation rel, FSMAddress addr, uint8 new_cat);
...@@ -370,21 +372,48 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks) ...@@ -370,21 +372,48 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
*/ */
if (rel->rd_smgr) if (rel->rd_smgr)
rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks; rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks;
/*
* Update upper-level FSM pages to account for the truncation. This is
* important because the just-truncated pages were likely marked as
* all-free, and would be preferentially selected.
*/
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
} }
/* /*
* FreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
*
* We assume that the bottom-level pages have already been updated with
* new free-space information.
*/ */
void void
FreeSpaceMapVacuum(Relation rel) FreeSpaceMapVacuum(Relation rel)
{ {
bool dummy; bool dummy;
/* /* Recursively scan the tree, starting at the root */
* Traverse the tree in depth-first order. The tree is stored physically (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
* in depth-first order, so this should be pretty I/O efficient. (BlockNumber) 0, InvalidBlockNumber,
*/ &dummy);
fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy); }
/*
* FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
*
* As above, but assume that only heap pages between start and end-1 inclusive
* have new free-space information, so update only the upper-level slots
* covering that block range. end == InvalidBlockNumber is equivalent to
* "all the rest of the relation".
*/
void
FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
{
bool dummy;
/* Recursively scan the tree, starting at the root */
if (end > start)
(void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
} }
/******** Internal routines ********/ /******** Internal routines ********/
...@@ -783,9 +812,21 @@ fsm_search(Relation rel, uint8 min_cat) ...@@ -783,9 +812,21 @@ fsm_search(Relation rel, uint8 min_cat)
/* /*
* Recursive guts of FreeSpaceMapVacuum * Recursive guts of FreeSpaceMapVacuum
*
* Examine the FSM page indicated by addr, as well as its children, updating
* upper-level nodes that cover the heap block range from start to end-1.
* (It's okay if end is beyond the actual end of the map.)
* Return the maximum freespace value on this page.
*
* If addr is past the end of the FSM, set *eof_p to true and return 0.
*
* This traverses the tree in depth-first order. The tree is stored
* physically in depth-first order, so this should be pretty I/O efficient.
*/ */
static uint8 static uint8
fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) fsm_vacuum_page(Relation rel, FSMAddress addr,
BlockNumber start, BlockNumber end,
bool *eof_p)
{ {
Buffer buf; Buffer buf;
Page page; Page page;
...@@ -804,15 +845,52 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) ...@@ -804,15 +845,52 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
page = BufferGetPage(buf); page = BufferGetPage(buf);
/* /*
* Recurse into children, and fix the information stored about them at * If we're above the bottom level, recurse into children, and fix the
* this level. * information stored about them at this level.
*/ */
if (addr.level > FSM_BOTTOM_LEVEL) if (addr.level > FSM_BOTTOM_LEVEL)
{ {
int slot; FSMAddress fsm_start,
fsm_end;
uint16 fsm_start_slot,
fsm_end_slot;
int slot,
start_slot,
end_slot;
bool eof = false; bool eof = false;
for (slot = 0; slot < SlotsPerFSMPage; slot++) /*
* Compute the range of slots we need to update on this page, given
* the requested range of heap blocks to consider. The first slot to
* update is the one covering the "start" block, and the last slot is
* the one covering "end - 1". (Some of this work will be duplicated
* in each recursive call, but it's cheap enough to not worry about.)
*/
fsm_start = fsm_get_location(start, &fsm_start_slot);
fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
while (fsm_start.level < addr.level)
{
fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
}
Assert(fsm_start.level == addr.level);
if (fsm_start.logpageno == addr.logpageno)
start_slot = fsm_start_slot;
else if (fsm_start.logpageno > addr.logpageno)
start_slot = SlotsPerFSMPage; /* shouldn't get here... */
else
start_slot = 0;
if (fsm_end.logpageno == addr.logpageno)
end_slot = fsm_end_slot;
else if (fsm_end.logpageno > addr.logpageno)
end_slot = SlotsPerFSMPage - 1;
else
end_slot = -1; /* shouldn't get here... */
for (slot = start_slot; slot <= end_slot; slot++)
{ {
int child_avail; int child_avail;
...@@ -820,7 +898,9 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) ...@@ -820,7 +898,9 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
/* After we hit end-of-file, just clear the rest of the slots */ /* After we hit end-of-file, just clear the rest of the slots */
if (!eof) if (!eof)
child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof); child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
start, end,
&eof);
else else
child_avail = 0; child_avail = 0;
...@@ -835,6 +915,7 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) ...@@ -835,6 +915,7 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
} }
} }
/* Now get the maximum value on the page, to return to caller */
max_avail = fsm_get_max_avail(BufferGetPage(buf)); max_avail = fsm_get_max_avail(BufferGetPage(buf));
/* /*
......
...@@ -32,6 +32,8 @@ extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, ...@@ -32,6 +32,8 @@ extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks); extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks);
extern void FreeSpaceMapVacuum(Relation rel); extern void FreeSpaceMapVacuum(Relation rel);
extern void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start,
BlockNumber end);
extern void UpdateFreeSpaceMap(Relation rel, extern void UpdateFreeSpaceMap(Relation rel,
BlockNumber startBlkNum, BlockNumber startBlkNum,
BlockNumber endBlkNum, BlockNumber endBlkNum,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment