Commit 857f9c36 authored by Teodor Sigaev's avatar Teodor Sigaev

Skip full index scan during cleanup of B-tree indexes when possible

Vacuum of index consists of two stages: multiple (zero or more) ambulkdelete
calls and one amvacuumcleanup call. When workload on particular table
is append-only, then autovacuum isn't intended to touch this table. However,
user may run vacuum manually in order to fill visibility map and get benefits
of index-only scans. Then ambulkdelete wouldn't be called for indexes
of such table (because no heap tuples were deleted), only amvacuumcleanup would
be called. In this case, amvacuumcleanup would perform a full index scan for
two objectives: put recyclable pages into free space map and update index
statistics.

This patch allows btvacuumcleanup to skip the full index scan when two conditions
are satisfied: no pages are going to be put into the free space map and the index
statistics aren't stale. In order to check the first condition, we store the
oldest btpo_xact in the meta-page. When it precedes RecentGlobalXmin, then
there are some recyclable pages. In order to check the second condition, we store
the number of heap tuples observed during the previous full index scan by cleanup.
If the fraction of newly inserted tuples is less than
vacuum_cleanup_index_scale_factor, then the statistics aren't considered to be
stale. vacuum_cleanup_index_scale_factor can be defined as both reloption and GUC (default).

This patch bumps B-tree meta-page version. Upgrade of meta-page is performed
"on the fly": during VACUUM meta-page is rewritten with new version. No special
handling in pg_upgrade is required.

Author: Masahiko Sawada, Alexander Korotkov
Review by: Peter Geoghegan, Kyotaro Horiguchi, Alexander Korotkov, Yura Sokolov
Discussion: https://www.postgresql.org/message-id/flat/CAD21AoAX+d2oD_nrd9O2YkpzHaFr=uQeGr9s1rKC3O4ENc568g@mail.gmail.com
parent eac93e20
...@@ -1500,12 +1500,14 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) ...@@ -1500,12 +1500,14 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
errmsg("index \"%s\" meta page is corrupt", errmsg("index \"%s\" meta page is corrupt",
RelationGetRelationName(state->rel)))); RelationGetRelationName(state->rel))));
if (metad->btm_version != BTREE_VERSION) if (metad->btm_version < BTREE_MIN_VERSION ||
metad->btm_version > BTREE_VERSION)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED), (errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("version mismatch in index \"%s\": file version %d, code version %d", errmsg("version mismatch in index \"%s\": file version %d, "
"current version %d, minimal supported version %d",
RelationGetRelationName(state->rel), RelationGetRelationName(state->rel),
metad->btm_version, BTREE_VERSION))); metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
} }
/* /*
......
...@@ -5,7 +5,8 @@ OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o \ ...@@ -5,7 +5,8 @@ OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o \
brinfuncs.o ginfuncs.o hashfuncs.o $(WIN32RES) brinfuncs.o ginfuncs.o hashfuncs.o $(WIN32RES)
EXTENSION = pageinspect EXTENSION = pageinspect
DATA = pageinspect--1.5.sql pageinspect--1.5--1.6.sql \ DATA = pageinspect--1.6--1.7.sql \
pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \ pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \
pageinspect--1.2--1.3.sql pageinspect--1.1--1.2.sql \ pageinspect--1.2--1.3.sql pageinspect--1.1--1.2.sql \
pageinspect--1.0--1.1.sql pageinspect--unpackaged--1.0.sql pageinspect--1.0--1.1.sql pageinspect--unpackaged--1.0.sql
......
...@@ -511,7 +511,7 @@ bt_metap(PG_FUNCTION_ARGS) ...@@ -511,7 +511,7 @@ bt_metap(PG_FUNCTION_ARGS)
BTMetaPageData *metad; BTMetaPageData *metad;
TupleDesc tupleDesc; TupleDesc tupleDesc;
int j; int j;
char *values[6]; char *values[8];
Buffer buffer; Buffer buffer;
Page page; Page page;
HeapTuple tuple; HeapTuple tuple;
...@@ -555,6 +555,8 @@ bt_metap(PG_FUNCTION_ARGS) ...@@ -555,6 +555,8 @@ bt_metap(PG_FUNCTION_ARGS)
values[j++] = psprintf("%d", metad->btm_level); values[j++] = psprintf("%d", metad->btm_level);
values[j++] = psprintf("%d", metad->btm_fastroot); values[j++] = psprintf("%d", metad->btm_fastroot);
values[j++] = psprintf("%d", metad->btm_fastlevel); values[j++] = psprintf("%d", metad->btm_fastlevel);
values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
values[j++] = psprintf("%lf", metad->btm_last_cleanup_num_heap_tuples);
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
values); values);
......
...@@ -3,13 +3,15 @@ INSERT INTO test1 VALUES (72057594037927937, 'text'); ...@@ -3,13 +3,15 @@ INSERT INTO test1 VALUES (72057594037927937, 'text');
CREATE INDEX test1_a_idx ON test1 USING btree (a); CREATE INDEX test1_a_idx ON test1 USING btree (a);
\x \x
SELECT * FROM bt_metap('test1_a_idx'); SELECT * FROM bt_metap('test1_a_idx');
-[ RECORD 1 ]----- -[ RECORD 1 ]-----------+-------
magic | 340322 magic | 340322
version | 2 version | 3
root | 1 root | 1
level | 0 level | 0
fastroot | 1 fastroot | 1
fastlevel | 0 fastlevel | 0
oldest_xact | 0
last_cleanup_num_tuples | -1
SELECT * FROM bt_page_stats('test1_a_idx', 0); SELECT * FROM bt_page_stats('test1_a_idx', 0);
ERROR: block 0 is a meta page ERROR: block 0 is a meta page
......
/* contrib/pageinspect/pageinspect--1.6--1.7.sql */

-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.7'" to load this file. \quit

--
-- bt_metap()
--
-- Recreate bt_metap() with two extra output columns that expose the
-- cleanup-related fields of the B-tree meta-page.  The function's result
-- type changes, so the old definition must be dropped first.
--
DROP FUNCTION bt_metap(IN relname text,
    OUT magic int4,
    OUT version int4,
    OUT root int4,
    OUT level int4,
    OUT fastroot int4,
    OUT fastlevel int4);
CREATE FUNCTION bt_metap(IN relname text,
    OUT magic int4,
    OUT version int4,
    OUT root int4,
    OUT level int4,
    OUT fastroot int4,
    OUT fastlevel int4,
    -- A transaction id is an unsigned 32-bit value (the C function prints it
    -- with "%u"); int4 would reject values above 2147483647, so use xid.
    OUT oldest_xact xid,
    -- The stored tuple count is a float8; "real" would silently lose
    -- precision for large tables.
    OUT last_cleanup_num_tuples float8)
AS 'MODULE_PATHNAME', 'bt_metap'
LANGUAGE C STRICT PARALLEL SAFE;
# pageinspect extension # pageinspect extension
comment = 'inspect the contents of database pages at a low level' comment = 'inspect the contents of database pages at a low level'
default_version = '1.6' default_version = '1.7'
module_pathname = '$libdir/pageinspect' module_pathname = '$libdir/pageinspect'
relocatable = true relocatable = true
...@@ -48,7 +48,7 @@ select version, tree_level, ...@@ -48,7 +48,7 @@ select version, tree_level,
from pgstatindex('test_pkey'); from pgstatindex('test_pkey');
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row) (1 row)
select version, tree_level, select version, tree_level,
...@@ -58,7 +58,7 @@ select version, tree_level, ...@@ -58,7 +58,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::text); from pgstatindex('test_pkey'::text);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row) (1 row)
select version, tree_level, select version, tree_level,
...@@ -68,7 +68,7 @@ select version, tree_level, ...@@ -68,7 +68,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::name); from pgstatindex('test_pkey'::name);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row) (1 row)
select version, tree_level, select version, tree_level,
...@@ -78,7 +78,7 @@ select version, tree_level, ...@@ -78,7 +78,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::regclass); from pgstatindex('test_pkey'::regclass);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row) (1 row)
select pg_relpages('test'); select pg_relpages('test');
...@@ -229,7 +229,7 @@ create index test_partition_hash_idx on test_partition using hash (a); ...@@ -229,7 +229,7 @@ create index test_partition_hash_idx on test_partition using hash (a);
select pgstatindex('test_partition_idx'); select pgstatindex('test_partition_idx');
pgstatindex pgstatindex
------------------------------ ------------------------------
(2,0,8192,0,0,0,0,0,NaN,NaN) (3,0,8192,0,0,0,0,0,NaN,NaN)
(1 row) (1 row)
select pgstathashindex('test_partition_hash_idx'); select pgstathashindex('test_partition_hash_idx');
......
...@@ -1882,6 +1882,31 @@ include_dir 'conf.d' ...@@ -1882,6 +1882,31 @@ include_dir 'conf.d'
</note> </note>
</sect2> </sect2>
<sect2 id="runtime-config-index-vacuum">
<title>Index Vacuum</title>
<variablelist>
<varlistentry id="guc-vacuum-cleanup-index-scale-factor" xreflabel="vacuum_cleanup_index_scale_factor">
<term><varname>vacuum_cleanup_index_scale_factor</varname> (<type>floating point</type>)
<indexterm>
<primary><varname>vacuum_cleanup_index_scale_factor</varname> configuration parameter</primary>
</indexterm>
</term>
<listitem>
<para>
When no tuples were deleted from the heap, B-tree indexes might still
be scanned during the <command>VACUUM</command> cleanup stage for two
reasons. The first reason is that the B-tree index contains deleted pages
which can be recycled during cleanup. The second reason is that the B-tree
index statistics are stale. Index statistics are considered stale when
the number of tuples inserted since the previous statistics collection
is greater than the <varname>vacuum_cleanup_index_scale_factor</varname>
fraction of the number of heap tuples seen by that previous collection.
</para>
</listitem>
</varlistentry>
</variablelist>
</sect2>
<sect2 id="runtime-config-resource-background-writer"> <sect2 id="runtime-config-resource-background-writer">
<title>Background Writer</title> <title>Background Writer</title>
......
...@@ -247,13 +247,15 @@ test=# SELECT * FROM heap_page_item_attrs(get_raw_page('pg_class', 0), 'pg_class ...@@ -247,13 +247,15 @@ test=# SELECT * FROM heap_page_item_attrs(get_raw_page('pg_class', 0), 'pg_class
index's metapage. For example: index's metapage. For example:
<screen> <screen>
test=# SELECT * FROM bt_metap('pg_cast_oid_index'); test=# SELECT * FROM bt_metap('pg_cast_oid_index');
-[ RECORD 1 ]----- -[ RECORD 1 ]-----------+-------
magic | 340322 magic | 340322
version | 2 version | 3
root | 1 root | 1
level | 0 level | 0
fastroot | 1 fastroot | 1
fastlevel | 0 fastlevel | 0
oldest_xact | 582
last_cleanup_num_tuples | 1000
</screen> </screen>
</para> </para>
</listitem> </listitem>
......
...@@ -369,6 +369,21 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class= ...@@ -369,6 +369,21 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
</varlistentry> </varlistentry>
</variablelist> </variablelist>
<para>
B-tree indexes additionally accept this parameter:
</para>
<variablelist>
<varlistentry>
<term><literal>vacuum_cleanup_index_scale_factor</literal></term>
<listitem>
<para>
Per-index value for <xref linkend="guc-vacuum-cleanup-index-scale-factor"/>.
</para>
</listitem>
</varlistentry>
</variablelist>
<para> <para>
GiST indexes additionally accept this parameter: GiST indexes additionally accept this parameter:
</para> </para>
......
...@@ -409,6 +409,15 @@ static relopt_real realRelOpts[] = ...@@ -409,6 +409,15 @@ static relopt_real realRelOpts[] =
}, },
0, -1.0, DBL_MAX 0, -1.0, DBL_MAX
}, },
{
{
"vacuum_cleanup_index_scale_factor",
"Number of tuple inserts prior to index cleanup as a fraction of reltuples.",
RELOPT_KIND_BTREE,
ShareUpdateExclusiveLock
},
-1, 0.0, 100.0
},
/* list terminator */ /* list terminator */
{{NULL}} {{NULL}}
}; };
...@@ -1371,7 +1380,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) ...@@ -1371,7 +1380,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
{"user_catalog_table", RELOPT_TYPE_BOOL, {"user_catalog_table", RELOPT_TYPE_BOOL,
offsetof(StdRdOptions, user_catalog_table)}, offsetof(StdRdOptions, user_catalog_table)},
{"parallel_workers", RELOPT_TYPE_INT, {"parallel_workers", RELOPT_TYPE_INT,
offsetof(StdRdOptions, parallel_workers)} offsetof(StdRdOptions, parallel_workers)},
{"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)}
}; };
options = parseRelOptions(reloptions, validate, kind, &numoptions); options = parseRelOptions(reloptions, validate, kind, &numoptions);
......
...@@ -939,6 +939,9 @@ _bt_insertonpg(Relation rel, ...@@ -939,6 +939,9 @@ _bt_insertonpg(Relation rel,
if (BufferIsValid(metabuf)) if (BufferIsValid(metabuf))
{ {
/* upgrade meta-page if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = itup_blkno; metad->btm_fastroot = itup_blkno;
metad->btm_fastlevel = lpageop->btpo.level; metad->btm_fastlevel = lpageop->btpo.level;
MarkBufferDirty(metabuf); MarkBufferDirty(metabuf);
...@@ -997,6 +1000,9 @@ _bt_insertonpg(Relation rel, ...@@ -997,6 +1000,9 @@ _bt_insertonpg(Relation rel,
xlmeta.level = metad->btm_level; xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel; xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples =
metad->btm_last_cleanup_num_heap_tuples;
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
...@@ -2049,6 +2055,10 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) ...@@ -2049,6 +2055,10 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
metapg = BufferGetPage(metabuf); metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg); metad = BTPageGetMeta(metapg);
/* upgrade metapage if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);
/* /*
* Create downlink item for left page (old root). Since this will be the * Create downlink item for left page (old root). Since this will be the
* first item in a non-leaf page, it implicitly has minus-infinity key * first item in a non-leaf page, it implicitly has minus-infinity key
...@@ -2138,6 +2148,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) ...@@ -2138,6 +2148,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
md.level = metad->btm_level; md.level = metad->btm_level;
md.fastroot = rootblknum; md.fastroot = rootblknum;
md.fastlevel = metad->btm_level; md.fastlevel = metad->btm_level;
md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
......
...@@ -60,6 +60,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) ...@@ -60,6 +60,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
metad->btm_level = level; metad->btm_level = level;
metad->btm_fastroot = rootbknum; metad->btm_fastroot = rootbknum;
metad->btm_fastlevel = level; metad->btm_fastlevel = level;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
metaopaque->btpo_flags = BTP_META; metaopaque->btpo_flags = BTP_META;
...@@ -73,6 +75,114 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) ...@@ -73,6 +75,114 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
((char *) metad + sizeof(BTMetaPageData)) - (char *) page; ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
} }
/*
 *	_bt_upgrademetapage() -- Bring an old-format meta-page up to the
 *							 current BTREE_VERSION layout.
 *
 *		Only the in-memory page image is modified here.  Locking the buffer,
 *		marking it dirty and WAL-logging the change are left to the caller.
 */
void
_bt_upgrademetapage(Page page)
{
	BTPageOpaque opaque;
	BTMetaPageData *meta;

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	meta = BTPageGetMeta(page);

	/* Caller must hand us a genuine meta-page of an upgradable version */
	Assert(opaque->btpo_flags & BTP_META);
	Assert(meta->btm_version < BTREE_VERSION);
	Assert(meta->btm_version >= BTREE_MIN_VERSION);

	/*
	 * Initialize the fields introduced with version 3 to their "unknown"
	 * values, then stamp the page with the current version number.
	 */
	meta->btm_last_cleanup_num_heap_tuples = -1.0;
	meta->btm_oldest_btpo_xact = InvalidTransactionId;
	meta->btm_version = BTREE_VERSION;

	/* pd_lower must cover the enlarged metadata (see _bt_initmetapage()) */
	((PageHeader) page)->pd_lower =
		((char *) meta + sizeof(BTMetaPageData)) - (char *) page;
}
/*
 *	_bt_update_meta_cleanup_info() -- Update cleanup-related information in
 *									  the metapage.
 *
 *		Compares the supplied oldest btpo.xact and heap tuple count with the
 *		values stored in the metapage.  If they differ, or if the metapage is
 *		of an outdated version, the metapage is overwritten (upgrading it if
 *		necessary) and the change is WAL-logged when the relation needs WAL.
 *		If nothing differs, the metapage is left untouched.
 */
void
_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
							 float8 numHeapTuples)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	bool		needsRewrite = false;
	XLogRecPtr	recptr;

	/* read the metapage and check if it needs rewrite */
	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	/* outdated version of metapage always needs rewrite */
	if (metad->btm_version < BTREE_VERSION)
		needsRewrite = true;
	else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
			 metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
		needsRewrite = true;

	if (!needsRewrite)
	{
		_bt_relbuf(rel, metabuf);
		return;
	}

	/*
	 * Trade in our read lock for a write lock.  The metapage is not
	 * re-examined after the lock is reacquired: the code below stores the
	 * caller's values unconditionally, and the version check is repeated
	 * before upgrading.
	 */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
	LockBuffer(metabuf, BT_WRITE);

	START_CRIT_SECTION();

	/* upgrade meta-page if needed */
	if (metad->btm_version < BTREE_VERSION)
		_bt_upgrademetapage(metapg);

	/* update cleanup-related information */
	metad->btm_oldest_btpo_xact = oldestBtpoXact;
	metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
	MarkBufferDirty(metabuf);

	/* write wal record if needed */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_metadata md;

		XLogBeginInsert();
		XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

		/* the record carries the complete metadata, not just the changes */
		md.root = metad->btm_root;
		md.level = metad->btm_level;
		md.fastroot = metad->btm_fastroot;
		md.fastlevel = metad->btm_fastlevel;
		md.oldest_btpo_xact = oldestBtpoXact;
		md.last_cleanup_num_heap_tuples = numHeapTuples;

		XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

		PageSetLSN(metapg, recptr);
	}

	END_CRIT_SECTION();
	_bt_relbuf(rel, metabuf);
}
/* /*
* _bt_getroot() -- Get the root page of the btree. * _bt_getroot() -- Get the root page of the btree.
* *
...@@ -124,7 +234,8 @@ _bt_getroot(Relation rel, int access) ...@@ -124,7 +234,8 @@ _bt_getroot(Relation rel, int access)
metad = (BTMetaPageData *) rel->rd_amcache; metad = (BTMetaPageData *) rel->rd_amcache;
/* We shouldn't have cached it if any of these fail */ /* We shouldn't have cached it if any of these fail */
Assert(metad->btm_magic == BTREE_MAGIC); Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version == BTREE_VERSION); Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
Assert(metad->btm_root != P_NONE); Assert(metad->btm_root != P_NONE);
rootblkno = metad->btm_fastroot; rootblkno = metad->btm_fastroot;
...@@ -170,12 +281,14 @@ _bt_getroot(Relation rel, int access) ...@@ -170,12 +281,14 @@ _bt_getroot(Relation rel, int access)
errmsg("index \"%s\" is not a btree", errmsg("index \"%s\" is not a btree",
RelationGetRelationName(rel)))); RelationGetRelationName(rel))));
if (metad->btm_version != BTREE_VERSION) if (metad->btm_version < BTREE_MIN_VERSION ||
metad->btm_version > BTREE_VERSION)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED), (errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("version mismatch in index \"%s\": file version %d, code version %d", errmsg("version mismatch in index \"%s\": file version %d, "
"current version %d, minimal supported version %d",
RelationGetRelationName(rel), RelationGetRelationName(rel),
metad->btm_version, BTREE_VERSION))); metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
/* if no root page initialized yet, do it */ /* if no root page initialized yet, do it */
if (metad->btm_root == P_NONE) if (metad->btm_root == P_NONE)
...@@ -191,6 +304,10 @@ _bt_getroot(Relation rel, int access) ...@@ -191,6 +304,10 @@ _bt_getroot(Relation rel, int access)
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
LockBuffer(metabuf, BT_WRITE); LockBuffer(metabuf, BT_WRITE);
/* upgrade metapage if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);
/* /*
* Race condition: if someone else initialized the metadata between * Race condition: if someone else initialized the metadata between
* the time we released the read lock and acquired the write lock, we * the time we released the read lock and acquired the write lock, we
...@@ -229,6 +346,8 @@ _bt_getroot(Relation rel, int access) ...@@ -229,6 +346,8 @@ _bt_getroot(Relation rel, int access)
metad->btm_level = 0; metad->btm_level = 0;
metad->btm_fastroot = rootblkno; metad->btm_fastroot = rootblkno;
metad->btm_fastlevel = 0; metad->btm_fastlevel = 0;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
MarkBufferDirty(rootbuf); MarkBufferDirty(rootbuf);
MarkBufferDirty(metabuf); MarkBufferDirty(metabuf);
...@@ -248,6 +367,8 @@ _bt_getroot(Relation rel, int access) ...@@ -248,6 +367,8 @@ _bt_getroot(Relation rel, int access)
md.level = 0; md.level = 0;
md.fastroot = rootblkno; md.fastroot = rootblkno;
md.fastlevel = 0; md.fastlevel = 0;
md.oldest_btpo_xact = InvalidTransactionId;
md.last_cleanup_num_heap_tuples = -1.0;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
...@@ -373,12 +494,14 @@ _bt_gettrueroot(Relation rel) ...@@ -373,12 +494,14 @@ _bt_gettrueroot(Relation rel)
errmsg("index \"%s\" is not a btree", errmsg("index \"%s\" is not a btree",
RelationGetRelationName(rel)))); RelationGetRelationName(rel))));
if (metad->btm_version != BTREE_VERSION) if (metad->btm_version < BTREE_MIN_VERSION ||
metad->btm_version > BTREE_VERSION)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED), (errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("version mismatch in index \"%s\": file version %d, code version %d", errmsg("version mismatch in index \"%s\": file version %d, "
"current version %d, minimal supported version %d",
RelationGetRelationName(rel), RelationGetRelationName(rel),
metad->btm_version, BTREE_VERSION))); metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
/* if no root page initialized yet, fail */ /* if no root page initialized yet, fail */
if (metad->btm_root == P_NONE) if (metad->btm_root == P_NONE)
...@@ -460,12 +583,14 @@ _bt_getrootheight(Relation rel) ...@@ -460,12 +583,14 @@ _bt_getrootheight(Relation rel)
errmsg("index \"%s\" is not a btree", errmsg("index \"%s\" is not a btree",
RelationGetRelationName(rel)))); RelationGetRelationName(rel))));
if (metad->btm_version != BTREE_VERSION) if (metad->btm_version < BTREE_MIN_VERSION ||
metad->btm_version > BTREE_VERSION)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED), (errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("version mismatch in index \"%s\": file version %d, code version %d", errmsg("version mismatch in index \"%s\": file version %d, "
"current version %d, minimal supported version %d",
RelationGetRelationName(rel), RelationGetRelationName(rel),
metad->btm_version, BTREE_VERSION))); metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
/* /*
* If there's no root page yet, _bt_getroot() doesn't expect a cache * If there's no root page yet, _bt_getroot() doesn't expect a cache
...@@ -1784,6 +1909,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) ...@@ -1784,6 +1909,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
/* And update the metapage, if needed */ /* And update the metapage, if needed */
if (BufferIsValid(metabuf)) if (BufferIsValid(metabuf))
{ {
/* upgrade metapage if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = rightsib; metad->btm_fastroot = rightsib;
metad->btm_fastlevel = targetlevel; metad->btm_fastlevel = targetlevel;
MarkBufferDirty(metabuf); MarkBufferDirty(metabuf);
...@@ -1834,6 +1962,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) ...@@ -1834,6 +1962,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
xlmeta.level = metad->btm_level; xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel; xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
xlinfo = XLOG_BTREE_UNLINK_PAGE_META; xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
......
...@@ -19,11 +19,14 @@ ...@@ -19,11 +19,14 @@
#include "postgres.h" #include "postgres.h"
#include "access/nbtree.h" #include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h" #include "access/relscan.h"
#include "access/xlog.h" #include "access/xlog.h"
#include "commands/vacuum.h" #include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h" #include "nodes/execnodes.h"
#include "pgstat.h" #include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h" #include "storage/condition_variable.h"
#include "storage/indexfsm.h" #include "storage/indexfsm.h"
#include "storage/ipc.h" #include "storage/ipc.h"
...@@ -45,6 +48,7 @@ typedef struct ...@@ -45,6 +48,7 @@ typedef struct
BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */ BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */ BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
BlockNumber totFreePages; /* true total # of free pages */ BlockNumber totFreePages; /* true total # of free pages */
TransactionId oldestBtpoXact;
MemoryContext pagedelcontext; MemoryContext pagedelcontext;
} BTVacState; } BTVacState;
...@@ -89,7 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc; ...@@ -89,7 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state, IndexBulkDeleteCallback callback, void *callback_state,
BTCycleId cycleid); BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno); BlockNumber orig_blkno);
...@@ -773,6 +777,70 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan) ...@@ -773,6 +777,70 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan)
SpinLockRelease(&btscan->btps_mutex); SpinLockRelease(&btscan->btps_mutex);
} }
/*
 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
 *			btbulkdelete() wasn't called.
 *
 * Reads the metapage under a read lock and returns true when any of the
 * following holds:
 *	- the metapage is of an outdated version, so no cleanup-related
 *	  information has been stored yet;
 *	- the stored oldest btpo.xact is valid and precedes RecentGlobalXmin;
 *	- the heap tuple count passed in "info" exceeds the count stored at the
 *	  previous cleanup by more than the effective
 *	  vacuum_cleanup_index_scale_factor fraction, or no usable count or
 *	  factor is available.
 * Returns false otherwise.
 */
static bool
_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	bool		result = false;

	metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	if (metad->btm_version < BTREE_VERSION)
	{
		/*
		 * Do cleanup if metapage needs upgrade, because we don't have
		 * cleanup-related meta-information yet.
		 */
		result = true;
	}
	else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
			 TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
								   RecentGlobalXmin))
	{
		/*
		 * If oldest btpo.xact in the deleted pages is older than
		 * RecentGlobalXmin, then at least one deleted page can be recycled.
		 */
		result = true;
	}
	else
	{
		StdRdOptions *relopts;
		float8		cleanup_scale_factor;

		/*
		 * If the table receives a large enough number of insertions and no
		 * cleanup was performed, the index statistics might become stale.
		 * To avoid that, we perform cleanup once the table has grown by
		 * more than the vacuum_cleanup_index_scale_factor fraction.
		 *
		 * A non-negative per-index reloption takes precedence; otherwise
		 * fall back to the vacuum_cleanup_index_scale_factor variable.
		 */
		relopts = (StdRdOptions *) info->index->rd_options;
		cleanup_scale_factor = (relopts &&
								relopts->vacuum_cleanup_index_scale_factor >= 0)
			? relopts->vacuum_cleanup_index_scale_factor
			: vacuum_cleanup_index_scale_factor;

		/* a negative stored count means no previous count is available */
		if (cleanup_scale_factor < 0 ||
			metad->btm_last_cleanup_num_heap_tuples < 0 ||
			info->num_heap_tuples > (1.0 + cleanup_scale_factor) *
			metad->btm_last_cleanup_num_heap_tuples)
			result = true;
	}

	_bt_relbuf(info->index, metabuf);
	return result;
}
/* /*
* Bulk deletion of all index entries pointing to a set of heap tuples. * Bulk deletion of all index entries pointing to a set of heap tuples.
* The set of target tuples is specified via a callback routine that tells * The set of target tuples is specified via a callback routine that tells
...@@ -795,9 +863,20 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, ...@@ -795,9 +863,20 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* The ENSURE stuff ensures we clean up shared memory on failure */ /* The ENSURE stuff ensures we clean up shared memory on failure */
PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
{ {
TransactionId oldestBtpoXact;
cycleid = _bt_start_vacuum(rel); cycleid = _bt_start_vacuum(rel);
btvacuumscan(info, stats, callback, callback_state, cycleid); btvacuumscan(info, stats, callback, callback_state, cycleid,
&oldestBtpoXact);
/*
* Update cleanup-related information in metapage. These information
* is used only for cleanup but keeping up them to date can avoid
* unnecessary cleanup even after bulkdelete.
*/
_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
info->num_heap_tuples);
} }
PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
_bt_end_vacuum(rel); _bt_end_vacuum(rel);
...@@ -819,17 +898,28 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) ...@@ -819,17 +898,28 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
/* /*
* If btbulkdelete was called, we need not do anything, just return the * If btbulkdelete was called, we need not do anything, just return the
* stats from the latest btbulkdelete call. If it wasn't called, we must * stats from the latest btbulkdelete call. If it wasn't called, we might
* still do a pass over the index, to recycle any newly-recyclable pages * still need to do a pass over the index, to recycle any newly-recyclable
* and to obtain index statistics. * pages and to obtain index statistics. _bt_vacuum_needs_cleanup checks
* is there are newly-recyclable or stalled index statistics.
* *
* Since we aren't going to actually delete any leaf items, there's no * Since we aren't going to actually delete any leaf items, there's no
* need to go through all the vacuum-cycle-ID pushups. * need to go through all the vacuum-cycle-ID pushups.
*/ */
if (stats == NULL) if (stats == NULL)
{ {
TransactionId oldestBtpoXact;
/* Check if we need a cleanup */
if (!_bt_vacuum_needs_cleanup(info))
return NULL;
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
btvacuumscan(info, stats, NULL, NULL, 0); btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
/* Update cleanup-related information in the metapage */
_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
info->num_heap_tuples);
} }
/* /*
...@@ -862,7 +952,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) ...@@ -862,7 +952,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
static void static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state, IndexBulkDeleteCallback callback, void *callback_state,
BTCycleId cycleid) BTCycleId cycleid, TransactionId *oldestBtpoXact)
{ {
Relation rel = info->index; Relation rel = info->index;
BTVacState vstate; BTVacState vstate;
...@@ -887,6 +977,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, ...@@ -887,6 +977,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
vstate.lastBlockLocked = BTREE_METAPAGE; vstate.lastBlockLocked = BTREE_METAPAGE;
vstate.totFreePages = 0; vstate.totFreePages = 0;
vstate.oldestBtpoXact = InvalidTransactionId;
/* Create a temporary memory context to run _bt_pagedel in */ /* Create a temporary memory context to run _bt_pagedel in */
vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
...@@ -991,6 +1082,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, ...@@ -991,6 +1082,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* update statistics */ /* update statistics */
stats->num_pages = num_pages; stats->num_pages = num_pages;
stats->pages_free = vstate.totFreePages; stats->pages_free = vstate.totFreePages;
if (oldestBtpoXact)
*oldestBtpoXact = vstate.oldestBtpoXact;
} }
/* /*
...@@ -1070,6 +1164,11 @@ restart: ...@@ -1070,6 +1164,11 @@ restart:
{ {
/* Already deleted, but can't recycle yet */ /* Already deleted, but can't recycle yet */
stats->pages_deleted++; stats->pages_deleted++;
/* Update the oldest btpo.xact */
if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
vstate->oldestBtpoXact = opaque->btpo.xact;
} }
else if (P_ISHALFDEAD(opaque)) else if (P_ISHALFDEAD(opaque))
{ {
...@@ -1238,7 +1337,12 @@ restart: ...@@ -1238,7 +1337,12 @@ restart:
/* count only this page, else may double-count parent */ /* count only this page, else may double-count parent */
if (ndel) if (ndel)
{
stats->pages_deleted++; stats->pages_deleted++;
if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
vstate->oldestBtpoXact = opaque->btpo.xact;
}
MemoryContextSwitchTo(oldcontext); MemoryContextSwitchTo(oldcontext);
/* pagedel released buffer, so we shouldn't */ /* pagedel released buffer, so we shouldn't */
......
...@@ -108,6 +108,8 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) ...@@ -108,6 +108,8 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
md->btm_level = xlrec->level; md->btm_level = xlrec->level;
md->btm_fastroot = xlrec->fastroot; md->btm_fastroot = xlrec->fastroot;
md->btm_fastlevel = xlrec->fastlevel; md->btm_fastlevel = xlrec->fastlevel;
md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
pageop->btpo_flags = BTP_META; pageop->btpo_flags = BTP_META;
...@@ -985,7 +987,6 @@ btree_xlog_reuse_page(XLogReaderState *record) ...@@ -985,7 +987,6 @@ btree_xlog_reuse_page(XLogReaderState *record)
} }
} }
void void
btree_redo(XLogReaderState *record) btree_redo(XLogReaderState *record)
{ {
...@@ -1027,6 +1028,9 @@ btree_redo(XLogReaderState *record) ...@@ -1027,6 +1028,9 @@ btree_redo(XLogReaderState *record)
case XLOG_BTREE_REUSE_PAGE: case XLOG_BTREE_REUSE_PAGE:
btree_xlog_reuse_page(record); btree_xlog_reuse_page(record);
break; break;
case XLOG_BTREE_META_CLEANUP:
_bt_restore_meta(record, 0);
break;
default: default:
elog(PANIC, "btree_redo: unknown op code %u", info); elog(PANIC, "btree_redo: unknown op code %u", info);
} }
......
...@@ -138,3 +138,5 @@ int VacuumPageDirty = 0; ...@@ -138,3 +138,5 @@ int VacuumPageDirty = 0;
int VacuumCostBalance = 0; /* working state for vacuum */ int VacuumCostBalance = 0; /* working state for vacuum */
bool VacuumCostActive = false; bool VacuumCostActive = false;
double vacuum_cleanup_index_scale_factor;
...@@ -3208,6 +3208,16 @@ static struct config_real ConfigureNamesReal[] = ...@@ -3208,6 +3208,16 @@ static struct config_real ConfigureNamesReal[] =
NULL, NULL, NULL NULL, NULL, NULL
}, },
{
{"vacuum_cleanup_index_scale_factor", PGC_SIGHUP, AUTOVACUUM,
gettext_noop("Number of tuple inserts prior to index cleanup as a fraction of reltuples."),
NULL
},
&vacuum_cleanup_index_scale_factor,
0.1, 0.0, 100.0,
NULL, NULL, NULL
},
/* End-of-list marker */ /* End-of-list marker */
{ {
{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL {NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL
......
...@@ -102,6 +102,11 @@ typedef struct BTMetaPageData ...@@ -102,6 +102,11 @@ typedef struct BTMetaPageData
uint32 btm_level; /* tree level of the root page */ uint32 btm_level; /* tree level of the root page */
BlockNumber btm_fastroot; /* current "fast" root location */ BlockNumber btm_fastroot; /* current "fast" root location */
uint32 btm_fastlevel; /* tree level of the "fast" root page */ uint32 btm_fastlevel; /* tree level of the "fast" root page */
/* following fields are available since page version 3 */
TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all
* deleted pages */
float4 btm_last_cleanup_num_heap_tuples; /* number of heap tuples
* during last cleanup */
} BTMetaPageData; } BTMetaPageData;
#define BTPageGetMeta(p) \ #define BTPageGetMeta(p) \
...@@ -109,7 +114,8 @@ typedef struct BTMetaPageData ...@@ -109,7 +114,8 @@ typedef struct BTMetaPageData
#define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_METAPAGE 0 /* first page is meta */
#define BTREE_MAGIC 0x053162 /* magic number of btree pages */ #define BTREE_MAGIC 0x053162 /* magic number of btree pages */
#define BTREE_VERSION 2 /* current version number */ #define BTREE_VERSION 3 /* current version number */
#define BTREE_MIN_VERSION 2 /* minimal supported version number */
/* /*
* Maximum size of a btree index entry, including its tuple header. * Maximum size of a btree index entry, including its tuple header.
...@@ -481,6 +487,9 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); ...@@ -481,6 +487,9 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
* prototypes for functions in nbtpage.c * prototypes for functions in nbtpage.c
*/ */
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern void _bt_update_meta_cleanup_info(Relation rel,
TransactionId oldestBtpoXact, float8 numHeapTuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel); extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel); extern int _bt_getrootheight(Relation rel);
......
...@@ -38,6 +38,8 @@ ...@@ -38,6 +38,8 @@
* vacuum */ * vacuum */
#define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from
* FSM */ * FSM */
#define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the
* metapage */
/* /*
* All that we need to regenerate the meta-data page * All that we need to regenerate the meta-data page
...@@ -48,6 +50,8 @@ typedef struct xl_btree_metadata ...@@ -48,6 +50,8 @@ typedef struct xl_btree_metadata
uint32 level; uint32 level;
BlockNumber fastroot; BlockNumber fastroot;
uint32 fastlevel; uint32 fastlevel;
TransactionId oldest_btpo_xact;
double last_cleanup_num_heap_tuples;
} xl_btree_metadata; } xl_btree_metadata;
/* /*
......
...@@ -256,6 +256,8 @@ extern int VacuumPageDirty; ...@@ -256,6 +256,8 @@ extern int VacuumPageDirty;
extern int VacuumCostBalance; extern int VacuumCostBalance;
extern bool VacuumCostActive; extern bool VacuumCostActive;
extern double vacuum_cleanup_index_scale_factor;
/* in tcop/postgres.c */ /* in tcop/postgres.c */
......
...@@ -287,6 +287,8 @@ typedef struct StdRdOptions ...@@ -287,6 +287,8 @@ typedef struct StdRdOptions
{ {
int32 vl_len_; /* varlena header (do not touch directly!) */ int32 vl_len_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */ int fillfactor; /* page fill factor in percent (0..100) */
/* fraction of newly inserted tuples needed to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
int toast_tuple_target; /* target for tuple toasting */ int toast_tuple_target; /* target for tuple toasting */
AutoVacOpts autovacuum; /* autovacuum-related options */ AutoVacOpts autovacuum; /* autovacuum-related options */
bool user_catalog_table; /* use as an additional catalog relation */ bool user_catalog_table; /* use as an additional catalog relation */
......
...@@ -150,3 +150,32 @@ vacuum btree_tall_tbl; ...@@ -150,3 +150,32 @@ vacuum btree_tall_tbl;
-- need to insert some rows to cause the fast root page to split. -- need to insert some rows to cause the fast root page to split.
insert into btree_tall_tbl (id, t) insert into btree_tall_tbl (id, t)
select g, repeat('x', 100) from generate_series(1, 500) g; select g, repeat('x', 100) from generate_series(1, 500) g;
--
-- Test vacuum_cleanup_index_scale_factor
--
-- Simple create
create table btree_test(a int);
create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0);
select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
reloptions
------------------------------------------
{vacuum_cleanup_index_scale_factor=40.0}
(1 row)
-- Fail while setting improper values
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0);
ERROR: value -10.0 out of bounds for option "vacuum_cleanup_index_scale_factor"
DETAIL: Valid values are between "0.000000" and "100.000000".
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0);
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string');
ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": string
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true);
ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": true
-- Simple ALTER INDEX
alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0);
select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
reloptions
------------------------------------------
{vacuum_cleanup_index_scale_factor=70.0}
(1 row)
...@@ -92,3 +92,22 @@ vacuum btree_tall_tbl; ...@@ -92,3 +92,22 @@ vacuum btree_tall_tbl;
-- need to insert some rows to cause the fast root page to split. -- need to insert some rows to cause the fast root page to split.
insert into btree_tall_tbl (id, t) insert into btree_tall_tbl (id, t)
select g, repeat('x', 100) from generate_series(1, 500) g; select g, repeat('x', 100) from generate_series(1, 500) g;
--
-- Test vacuum_cleanup_index_scale_factor
--
-- Simple create
create table btree_test(a int);
create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0);
select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
-- Fail while setting improper values
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0);
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0);
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string');
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true);
-- Simple ALTER INDEX
alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0);
select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment