Commit 391eb5e5 authored by Tom Lane

Reimplement free-space-map management as per recent discussions.

Adjustable threshold is gone in favor of keeping track of total requested
page storage and doling out proportional fractions to each relation
(with a minimum amount per relation, and some quantization of the results
to avoid thrashing with small changes in page counts).  Provide special-
case code for indexes so as not to waste space storing useless page
free space counts.  Restructure internal data storage to be a flat array
instead of list-of-chunks; this may cost a little more work in data
copying when reorganizing, but allows binary search to be used during
lookup_fsm_page_entry().
parent a455c942
<!--
$Header: /cvsroot/pgsql/doc/src/sgml/runtime.sgml,v 1.169 2003/02/19 04:06:28 momjian Exp $
$Header: /cvsroot/pgsql/doc/src/sgml/runtime.sgml,v 1.170 2003/03/04 21:51:19 tgl Exp $
-->
<Chapter Id="runtime">
......@@ -1725,7 +1725,9 @@ dynamic_library_path = '/usr/local/lib/postgresql:/home/my_project/lib:$libdir'
<listitem>
<para>
Sets the maximum number of disk pages for which free space will
be tracked in the shared free-space map. The default is 10000.
be tracked in the shared free-space map. Six bytes of shared memory
are consumed for each page slot. This setting must be more than
16 * <varname>max_fsm_relations</varname>. The default is 20000.
This option can only be set at server start.
</para>
</listitem>
......@@ -1735,9 +1737,11 @@ dynamic_library_path = '/usr/local/lib/postgresql:/home/my_project/lib:$libdir'
<term><varname>MAX_FSM_RELATIONS</varname> (<type>integer</type>)</term>
<listitem>
<para>
Sets the maximum number of relations (tables) for which free
space will be tracked in the shared free-space map. The default
is 1000. This option can only be set at server start.
Sets the maximum number of relations (tables and indexes) for which
free space will be tracked in the shared free-space map. Roughly
fifty bytes of shared memory are consumed for each slot.
The default is 1000.
This option can only be set at server start.
</para>
</listitem>
</varlistentry>
......
......@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.63 2003/02/23 23:20:52 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.64 2003/03/04 21:51:20 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
......@@ -401,15 +401,10 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
* that the page is still free. (For example, an already-free page
* could have been re-used between the time the last VACUUM scanned
* it and the time the VACUUM made its FSM updates.)
*
* The request size should be more than half of what btvacuumcleanup
* logs as the per-page free space. We use BLCKSZ/2 and BLCKSZ-1
* to try to get some use out of FSM's space management algorithm.
* XXX this needs some more thought...
*/
for (;;)
{
blkno = GetPageWithFreeSpace(&rel->rd_node, BLCKSZ/2);
blkno = GetFreeIndexPage(&rel->rd_node);
if (blkno == InvalidBlockNumber)
break;
buf = ReadBuffer(rel, blkno);
......
......@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.100 2003/02/24 00:57:17 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.101 2003/03/04 21:51:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -697,7 +697,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(2);
BlockNumber num_pages;
BlockNumber blkno;
PageFreeSpaceInfo *pageSpaces;
BlockNumber *freePages;
int nFreePages,
maxFreePages;
BlockNumber pages_deleted = 0;
......@@ -712,7 +712,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
maxFreePages = MaxFSMPages;
if ((BlockNumber) maxFreePages > num_pages)
maxFreePages = (int) num_pages + 1; /* +1 to avoid palloc(0) */
pageSpaces = (PageFreeSpaceInfo *) palloc(maxFreePages * sizeof(PageFreeSpaceInfo));
freePages = (BlockNumber *) palloc(maxFreePages * sizeof(BlockNumber));
nFreePages = 0;
/* Create a temporary memory context to run _bt_pagedel in */
......@@ -740,12 +740,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
{
/* Okay to recycle this page */
if (nFreePages < maxFreePages)
{
pageSpaces[nFreePages].blkno = blkno;
/* claimed avail-space must be < BLCKSZ */
pageSpaces[nFreePages].avail = BLCKSZ-1;
nFreePages++;
}
freePages[nFreePages++] = blkno;
pages_deleted++;
}
else if (P_ISDELETED(opaque))
......@@ -781,12 +776,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
if (ndel && info->vacuum_full)
{
if (nFreePages < maxFreePages)
{
pageSpaces[nFreePages].blkno = blkno;
/* claimed avail-space must be < BLCKSZ */
pageSpaces[nFreePages].avail = BLCKSZ-1;
nFreePages++;
}
freePages[nFreePages++] = blkno;
}
MemoryContextSwitchTo(oldcontext);
......@@ -805,8 +795,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
{
BlockNumber new_pages = num_pages;
while (nFreePages > 0 &&
pageSpaces[nFreePages-1].blkno == new_pages-1)
while (nFreePages > 0 && freePages[nFreePages-1] == new_pages-1)
{
new_pages--;
pages_deleted--;
......@@ -841,12 +830,12 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
/*
* Update the shared Free Space Map with the info we now have about
* free space in the index, discarding any old info the map may have.
* free pages in the index, discarding any old info the map may have.
* We do not need to sort the page numbers; they're in order already.
*/
MultiRecordFreeSpace(&rel->rd_node, 0, nFreePages, pageSpaces);
RecordIndexFreeSpace(&rel->rd_node, nFreePages, freePages);
pfree(pageSpaces);
pfree(freePages);
MemoryContextDelete(mycontext);
......
......@@ -13,7 +13,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.250 2003/02/24 00:57:17 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.251 2003/03/04 21:51:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -336,6 +336,13 @@ vacuum(VacuumStmt *vacstmt)
*/
StartTransactionCommand(true);
/*
* If it was a database-wide VACUUM, print FSM usage statistics
* (we don't make you be superuser to see these).
*/
if (vacstmt->relation == NULL)
PrintFreeSpaceMapStatistics(elevel);
/*
* If we completed a database-wide VACUUM without skipping any
* relations, update the database's pg_database row with info
......@@ -2781,31 +2788,48 @@ vac_update_fsm(Relation onerel, VacPageList fraged_pages,
BlockNumber rel_pages)
{
int nPages = fraged_pages->num_pages;
int i;
VacPage *pagedesc = fraged_pages->pagedesc;
Size threshold;
PageFreeSpaceInfo *pageSpaces;
int outPages;
int i;
/*
* We only report pages with free space at least equal to the average
* request size --- this avoids cluttering FSM with uselessly-small bits
* of space. Although FSM would discard pages with little free space
* anyway, it's important to do this prefiltering because (a) it reduces
* the time spent holding the FSM lock in RecordRelationFreeSpace, and
* (b) FSM uses the number of pages reported as a statistic for guiding
* space management. If we didn't threshold our reports the same way
* vacuumlazy.c does, we'd be skewing that statistic.
*/
threshold = GetAvgFSMRequestSize(&onerel->rd_node);
/* +1 to avoid palloc(0) */
pageSpaces = (PageFreeSpaceInfo *)
palloc((nPages + 1) * sizeof(PageFreeSpaceInfo));
outPages = 0;
for (i = 0; i < nPages; i++)
{
pageSpaces[i].blkno = fraged_pages->pagedesc[i]->blkno;
pageSpaces[i].avail = fraged_pages->pagedesc[i]->free;
/*
* fraged_pages may contain entries for pages that we later
* decided to truncate from the relation; don't enter them into
* the free space map!
*/
if (pageSpaces[i].blkno >= rel_pages)
{
nPages = i;
if (pagedesc[i]->blkno >= rel_pages)
break;
if (pagedesc[i]->free >= threshold)
{
pageSpaces[outPages].blkno = pagedesc[i]->blkno;
pageSpaces[outPages].avail = pagedesc[i]->free;
outPages++;
}
}
MultiRecordFreeSpace(&onerel->rd_node, 0, nPages, pageSpaces);
RecordRelationFreeSpace(&onerel->rd_node, outPages, pageSpaces);
pfree(pageSpaces);
}
......
......@@ -31,7 +31,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.26 2003/02/24 00:57:17 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.27 2003/03/04 21:51:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -51,21 +51,11 @@
/*
* Space/time tradeoff parameters: do these need to be user-tunable?
*
* A page with less than PAGE_SPACE_THRESHOLD free space will be forgotten
* immediately, and not even passed to the free space map. Removing the
* uselessly small entries early saves cycles, and in particular reduces
* the amount of time we spend holding the FSM lock when we finally call
* MultiRecordFreeSpace. Since the FSM will ignore pages below its own
* runtime threshold anyway, there's no point in making this really small.
* XXX Is it worth trying to measure average tuple size, and using that to
* set the threshold? Problem is we don't know average tuple size very
* accurately for the first few pages...
*
* To consider truncating the relation, we want there to be at least
* relsize / REL_TRUNCATE_FRACTION potentially-freeable pages.
* REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
* is less) potentially-freeable pages.
*/
#define PAGE_SPACE_THRESHOLD ((Size) (BLCKSZ / 32))
#define REL_TRUNCATE_MINIMUM 1000
#define REL_TRUNCATE_FRACTION 16
/* MAX_TUPLES_PER_PAGE can be a conservative upper limit */
......@@ -78,6 +68,7 @@ typedef struct LVRelStats
BlockNumber rel_pages;
double rel_tuples;
BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
Size threshold; /* minimum interesting free space */
/* List of TIDs of tuples we intend to delete */
/* NB: this list is ordered by TID address */
int num_dead_tuples; /* current # of entries */
......@@ -149,6 +140,10 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
/* Set threshold for interesting free space = average request size */
/* XXX should we scale it up or down? Adjust vacuum.c too, if so */
vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);
/* Open all indexes of the relation */
vac_open_indexes(onerel, &nindexes, &Irel);
hasindex = (nindexes > 0);
......@@ -166,7 +161,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
* number of pages. Otherwise, the time taken isn't worth it.
*/
possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
if (possibly_freeable > vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
if (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
lazy_truncate_heap(onerel, vacrelstats);
/* Update shared free space map with final free space info */
......@@ -943,8 +939,21 @@ lazy_record_free_space(LVRelStats *vacrelstats,
PageFreeSpaceInfo *pageSpaces;
int n;
/* Ignore pages with little free space */
if (avail < PAGE_SPACE_THRESHOLD)
/*
* A page with less than stats->threshold free space will be forgotten
* immediately, and never passed to the free space map. Removing the
* uselessly small entries early saves cycles, and in particular reduces
* the amount of time we spend holding the FSM lock when we finally call
* RecordRelationFreeSpace. Since the FSM will probably drop pages with
* little free space anyway, there's no point in making this really small.
*
* XXX Is it worth trying to measure average tuple size, and using that to
* adjust the threshold? Would be worthwhile if FSM has no stats yet
* for this relation. But changing the threshold as we scan the rel
* might lead to bizarre behavior, too. Also, it's probably better if
* vacuum.c has the same thresholding behavior as we do here.
*/
if (avail < vacrelstats->threshold)
return;
/* Copy pointers to local variables for notational simplicity */
......@@ -1079,13 +1088,13 @@ lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats)
int nPages = vacrelstats->num_free_pages;
/*
* Sort data into order, as required by MultiRecordFreeSpace.
* Sort data into order, as required by RecordRelationFreeSpace.
*/
if (nPages > 1)
qsort(pageSpaces, nPages, sizeof(PageFreeSpaceInfo),
vac_cmp_page_spaces);
MultiRecordFreeSpace(&onerel->rd_node, 0, nPages, pageSpaces);
RecordRelationFreeSpace(&onerel->rd_node, nPages, pageSpaces);
}
/*
......
This diff is collapsed.
......@@ -11,7 +11,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.61 2002/09/20 19:56:01 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.62 2003/03/04 21:51:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -410,7 +410,7 @@ smgrtruncate(int16 which, Relation reln, BlockNumber nblocks)
* for the about-to-be-deleted blocks. We want to be sure it
* won't return bogus block numbers later on.
*/
MultiRecordFreeSpace(&reln->rd_node, nblocks, 0, NULL);
FreeSpaceMapTruncateRel(&reln->rd_node, nblocks);
newblks = (*(smgrsw[which].smgr_truncate)) (reln, nblocks);
if (newblks == InvalidBlockNumber)
......
......@@ -5,7 +5,7 @@
* command, configuration file, and command line options.
* See src/backend/utils/misc/README for more information.
*
* $Header: /cvsroot/pgsql/src/backend/utils/misc/guc.c,v 1.115 2003/02/23 23:27:21 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/utils/misc/guc.c,v 1.116 2003/03/04 21:51:21 tgl Exp $
*
* Copyright 2000 by PostgreSQL Global Development Group
* Written by Peter Eisentraut <peter_e@gmx.net>.
......@@ -644,11 +644,11 @@ static struct config_int
{
{"max_fsm_relations", PGC_POSTMASTER}, &MaxFSMRelations,
1000, 10, INT_MAX, NULL, NULL
1000, 100, INT_MAX, NULL, NULL
},
{
{"max_fsm_pages", PGC_POSTMASTER}, &MaxFSMPages,
10000, 1000, INT_MAX, NULL, NULL
20000, 1000, INT_MAX, NULL, NULL
},
{
......
......@@ -48,10 +48,11 @@
# Shared Memory Size
#
#shared_buffers = 64 # min max_connections*2 or 16, 8KB each
#max_fsm_relations = 1000 # min 10, fsm is free space map, ~40 bytes
#max_fsm_pages = 10000 # min 1000, fsm is free space map, ~6 bytes
#max_locks_per_transaction = 64 # min 10
#wal_buffers = 8 # min 4, typically 8KB each
# fsm = free space map
#max_fsm_relations = 1000 # min 100, ~50 bytes each
#max_fsm_pages = 20000 # min max_fsm_relations*16, 6 bytes each
#
# Non-shared Memory Sizes
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: freespace.h,v 1.8 2002/09/20 19:56:01 tgl Exp $
* $Id: freespace.h,v 1.9 2003/03/04 21:51:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -28,6 +28,7 @@ typedef struct PageFreeSpaceInfo
} PageFreeSpaceInfo;
/* GUC variables */
extern int MaxFSMRelations;
extern int MaxFSMPages;
......@@ -39,19 +40,26 @@ extern void InitFreeSpaceMap(void);
extern int FreeSpaceShmemSize(void);
extern BlockNumber GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded);
extern void RecordFreeSpace(RelFileNode *rel, BlockNumber page,
Size spaceAvail);
extern BlockNumber RecordAndGetPageWithFreeSpace(RelFileNode *rel,
BlockNumber oldPage,
Size oldSpaceAvail,
Size spaceNeeded);
extern void MultiRecordFreeSpace(RelFileNode *rel,
BlockNumber minPage,
int nPages,
PageFreeSpaceInfo *pageSpaces);
extern Size GetAvgFSMRequestSize(RelFileNode *rel);
extern void RecordRelationFreeSpace(RelFileNode *rel,
int nPages,
PageFreeSpaceInfo *pageSpaces);
extern BlockNumber GetFreeIndexPage(RelFileNode *rel);
extern void RecordIndexFreeSpace(RelFileNode *rel,
int nPages,
BlockNumber *pages);
extern void FreeSpaceMapTruncateRel(RelFileNode *rel, BlockNumber nblocks);
extern void FreeSpaceMapForgetRel(RelFileNode *rel);
extern void FreeSpaceMapForgetDatabase(Oid dbid);
extern void PrintFreeSpaceMapStatistics(int elevel);
#ifdef FREESPACE_DEBUG
extern void DumpFreeSpace(void);
#endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment