Commit 93ee38ea authored by Peter Geoghegan

Teach pageinspect about nbtree deduplication.

Add a new bt_metap() column to display the metapage's allequalimage
field.  Also add three new columns to contrib/pageinspect's
bt_page_items() function:

* Add a boolean column ("dead") that displays the LP_DEAD bit value for
each non-pivot tuple.

* Add a TID column ("htid") that displays a single heap TID value for
each tuple.  This is the TID that is returned by BTreeTupleGetHeapTID(),
so comparable values are shown for pivot tuples, plain non-pivot tuples,
and posting list tuples.

* Add a TID array column ("tids") that displays TIDs from each tuple's
posting list, if any.  This works just like the "tids" column from
pageinspect's gin_leafpage_items() function.

No version bump for the pageinspect extension, since there hasn't been a
stable Postgres release since the last version bump (the last bump was
part of commit 58b4cb30).

Author: Peter Geoghegan
Discussion: https://postgr.es/m/CAH2-WzmSMmU2eNvY9+a4MNP+z02h6sa-uxZvN3un6jY02ZVBSw@mail.gmail.com
parent 58c47ccf
......@@ -31,9 +31,11 @@
#include "access/relation.h"
#include "catalog/namespace.h"
#include "catalog/pg_am.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "pageinspect.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/rel.h"
#include "utils/varlena.h"
......@@ -45,6 +47,8 @@ PG_FUNCTION_INFO_V1(bt_page_stats);
#define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
#define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID)
#define DatumGetItemPointer(X) ((ItemPointer) DatumGetPointer(X))
#define ItemPointerGetDatum(X) PointerGetDatum(X)
/* note: BlockNumber is unsigned, hence can't be negative */
#define CHECK_RELATION_BLOCK_RANGE(rel, blkno) { \
......@@ -243,6 +247,9 @@ struct user_args
{
Page page;
OffsetNumber offset;
bool leafpage;
bool rightmost;
TupleDesc tupd;
};
/*-------------------------------------------------------
......@@ -252,17 +259,25 @@ struct user_args
* ------------------------------------------------------
*/
static Datum
bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
bt_page_print_tuples(FuncCallContext *fctx, struct user_args *uargs)
{
char *values[6];
Page page = uargs->page;
OffsetNumber offset = uargs->offset;
bool leafpage = uargs->leafpage;
bool rightmost = uargs->rightmost;
bool ispivottuple;
Datum values[9];
bool nulls[9];
HeapTuple tuple;
ItemId id;
IndexTuple itup;
int j;
int off;
int dlen;
char *dump;
char *dump,
*datacstring;
char *ptr;
ItemPointer htid;
id = PageGetItemId(page, offset);
......@@ -272,18 +287,49 @@ bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
itup = (IndexTuple) PageGetItem(page, id);
j = 0;
values[j++] = psprintf("%d", offset);
values[j++] = psprintf("(%u,%u)",
ItemPointerGetBlockNumberNoCheck(&itup->t_tid),
ItemPointerGetOffsetNumberNoCheck(&itup->t_tid));
values[j++] = psprintf("%d", (int) IndexTupleSize(itup));
values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f');
values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');
/* Start with every output column marked as non-NULL */
memset(nulls, 0, sizeof(nulls));
/*
 * Int16GetDatum() is the int16-to-Datum direction, which is what is needed
 * to build a result column; DatumGetInt16() converts the other way (Datum
 * to int16) and must not be used to construct a Datum.
 */
values[j++] = Int16GetDatum(offset);
/* Raw t_tid field of the index tuple, shown as-is */
values[j++] = ItemPointerGetDatum(&itup->t_tid);
/* NOTE(review): built as an int32 Datum; confirm against the SQL-level type of the itemlen column */
values[j++] = Int32GetDatum((int) IndexTupleSize(itup));
values[j++] = BoolGetDatum(IndexTupleHasNulls(itup));
values[j++] = BoolGetDatum(IndexTupleHasVarwidths(itup));
ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
/*
* Make sure that "data" column does not include posting list or pivot
* tuple representation of heap TID(s).
*
* Note: BTreeTupleIsPivot() won't work reliably on !heapkeyspace indexes
* (those built before BTREE_VERSION 4), but we have no way of determining
* if this page came from a !heapkeyspace index. We may only have a bytea
* nbtree page image to go on, so in general there is no metapage that we
* can check.
*
* That's okay here because BTreeTupleIsPivot() can only return false for
* a !heapkeyspace pivot, never true for a !heapkeyspace non-pivot. Since
* heap TID isn't part of the keyspace in a !heapkeyspace index anyway,
* there cannot possibly be a pivot tuple heap TID representation that we
* fail to make an adjustment for. A !heapkeyspace index can have
* BTreeTupleIsPivot() return true (due to things like suffix truncation
* for INCLUDE indexes in Postgres v11), but when that happens
* BTreeTupleGetHeapTID() can be trusted to work reliably (i.e. return
* NULL).
*
* Note: BTreeTupleIsPosting() always works reliably, even with
* !heapkeyspace indexes.
*/
if (BTreeTupleIsPosting(itup))
dlen -= IndexTupleSize(itup) - BTreeTupleGetPostingOffset(itup);
else if (BTreeTupleIsPivot(itup) && BTreeTupleGetHeapTID(itup) != NULL)
dlen -= MAXALIGN(sizeof(ItemPointerData));
if (dlen < 0 || dlen > INDEX_SIZE_MASK)
elog(ERROR, "invalid tuple length %d for tuple at offset number %u",
dlen, offset);
dump = palloc0(dlen * 3 + 1);
values[j] = dump;
datacstring = dump;
for (off = 0; off < dlen; off++)
{
if (off > 0)
......@@ -291,8 +337,62 @@ bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
sprintf(dump, "%02x", *(ptr + off) & 0xff);
dump += 2;
}
values[j++] = CStringGetTextDatum(datacstring);
pfree(datacstring);
/*
* We need to work around the BTreeTupleIsPivot() !heapkeyspace limitation
* again. Deduce whether or not tuple must be a pivot tuple based on
* whether or not the page is a leaf page, as well as the page offset
* number of the tuple.
*/
ispivottuple = (!leafpage || (!rightmost && offset == P_HIKEY));
/* LP_DEAD bit can never be set for pivot tuples, so show a NULL there */
if (!ispivottuple)
values[j++] = BoolGetDatum(ItemIdIsDead(id));
else
{
Assert(!ItemIdIsDead(id));
nulls[j++] = true;
}
htid = BTreeTupleGetHeapTID(itup);
if (ispivottuple && !BTreeTupleIsPivot(itup))
{
/* Don't show bogus heap TID in !heapkeyspace pivot tuple */
htid = NULL;
}
if (htid)
values[j++] = ItemPointerGetDatum(htid);
else
nulls[j++] = true;
if (BTreeTupleIsPosting(itup))
{
/* Build an array of item pointers */
ItemPointer tids;
Datum *tids_datum;
int nposting;
tids = BTreeTupleGetPosting(itup);
nposting = BTreeTupleGetNPosting(itup);
tids_datum = (Datum *) palloc(nposting * sizeof(Datum));
for (int i = 0; i < nposting; i++)
tids_datum[i] = ItemPointerGetDatum(&tids[i]);
values[j++] = PointerGetDatum(construct_array(tids_datum,
nposting,
TIDOID,
sizeof(ItemPointerData),
false, 's'));
pfree(tids_datum);
}
else
nulls[j++] = true;
tuple = BuildTupleFromCStrings(fctx->attinmeta, values);
/* Build and return the result tuple */
tuple = heap_form_tuple(uargs->tupd, values, nulls);
return HeapTupleGetDatum(tuple);
}
......@@ -378,12 +478,15 @@ bt_page_items(PG_FUNCTION_ARGS)
elog(NOTICE, "page is deleted");
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
uargs->leafpage = P_ISLEAF(opaque);
uargs->rightmost = P_RIGHTMOST(opaque);
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
tupleDesc = BlessTupleDesc(tupleDesc);
fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
uargs->tupd = tupleDesc;
fctx->user_fctx = uargs;
......@@ -395,7 +498,7 @@ bt_page_items(PG_FUNCTION_ARGS)
if (fctx->call_cntr < fctx->max_calls)
{
result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
result = bt_page_print_tuples(fctx, uargs);
uargs->offset++;
SRF_RETURN_NEXT(fctx, result);
}
......@@ -463,12 +566,15 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
elog(NOTICE, "page is deleted");
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
uargs->leafpage = P_ISLEAF(opaque);
uargs->rightmost = P_RIGHTMOST(opaque);
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
tupleDesc = BlessTupleDesc(tupleDesc);
fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
uargs->tupd = tupleDesc;
fctx->user_fctx = uargs;
......@@ -480,7 +586,7 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
if (fctx->call_cntr < fctx->max_calls)
{
result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
result = bt_page_print_tuples(fctx, uargs);
uargs->offset++;
SRF_RETURN_NEXT(fctx, result);
}
......@@ -510,7 +616,7 @@ bt_metap(PG_FUNCTION_ARGS)
BTMetaPageData *metad;
TupleDesc tupleDesc;
int j;
char *values[8];
char *values[9];
Buffer buffer;
Page page;
HeapTuple tuple;
......@@ -557,17 +663,21 @@ bt_metap(PG_FUNCTION_ARGS)
/*
* Get values of extended metadata if available, use default values
* otherwise.
* otherwise. Note that we rely on the assumption that btm_allequalimage
* is initialized to zero with indexes that were built on versions prior
* to Postgres 13 (just like _bt_metaversion()).
*/
if (metad->btm_version >= BTREE_NOVAC_VERSION)
{
values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
values[j++] = metad->btm_allequalimage ? "t" : "f";
}
else
{
values[j++] = "0";
values[j++] = "-1";
values[j++] = "f";
}
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
......
......@@ -12,6 +12,7 @@ fastroot | 1
fastlevel | 0
oldest_xact | 0
last_cleanup_num_tuples | -1
allequalimage | t
SELECT * FROM bt_page_stats('test1_a_idx', 0);
ERROR: block 0 is a meta page
......@@ -41,6 +42,9 @@ itemlen | 16
nulls | f
vars | f
data | 01 00 00 00 00 00 00 01
dead | f
htid | (0,1)
tids |
SELECT * FROM bt_page_items('test1_a_idx', 2);
ERROR: block number out of range
......@@ -54,6 +58,9 @@ itemlen | 16
nulls | f
vars | f
data | 01 00 00 00 00 00 00 01
dead | f
htid | (0,1)
tids |
SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 2));
ERROR: block number 2 is out of range for relation "test1_a_idx"
......
......@@ -14,3 +14,56 @@ CREATE FUNCTION heap_tuple_infomask_flags(
RETURNS record
AS 'MODULE_PATHNAME', 'heap_tuple_infomask_flags'
LANGUAGE C STRICT PARALLEL SAFE;
--
-- bt_metap()
--
-- Recreates bt_metap(text) with one additional output column,
-- allequalimage (boolean), appended after last_cleanup_num_tuples.
-- NOTE(review): the function is presumably dropped and recreated (rather
-- than replaced in place) because its set of OUT columns changes; confirm.
--
DROP FUNCTION bt_metap(text);
CREATE FUNCTION bt_metap(IN relname text,
OUT magic int4,
OUT version int4,
OUT root int4,
OUT level int4,
OUT fastroot int4,
OUT fastlevel int4,
OUT oldest_xact int4,
OUT last_cleanup_num_tuples real,
OUT allequalimage boolean)
AS 'MODULE_PATHNAME', 'bt_metap'
LANGUAGE C STRICT PARALLEL SAFE;
--
-- bt_page_items(text, int4)
--
-- Recreates bt_page_items(text, int4) with three additional output
-- columns appended after "data":
--   dead - boolean flag for the tuple's line pointer
--   htid - a single heap TID for the tuple
--   tids - array of TIDs (tid[]) for the tuple
-- NOTE(review): the function is presumably dropped and recreated (rather
-- than replaced in place) because its set of OUT columns changes; confirm.
--
DROP FUNCTION bt_page_items(text, int4);
CREATE FUNCTION bt_page_items(IN relname text, IN blkno int4,
OUT itemoffset smallint,
OUT ctid tid,
OUT itemlen smallint,
OUT nulls bool,
OUT vars bool,
OUT data text,
OUT dead boolean,
OUT htid tid,
OUT tids tid[])
RETURNS SETOF record
AS 'MODULE_PATHNAME', 'bt_page_items'
LANGUAGE C STRICT PARALLEL SAFE;
--
-- bt_page_items(bytea)
--
-- Recreates the bytea variant of bt_page_items(), which takes a raw page
-- image instead of a relation name and block number.  Its output columns
-- are declared identically to those of bt_page_items(text, int4),
-- including the trailing dead, htid and tids columns.  The C entry point
-- is bt_page_items_bytea.
-- NOTE(review): the function is presumably dropped and recreated (rather
-- than replaced in place) because its set of OUT columns changes; confirm.
--
DROP FUNCTION bt_page_items(bytea);
CREATE FUNCTION bt_page_items(IN page bytea,
OUT itemoffset smallint,
OUT ctid tid,
OUT itemlen smallint,
OUT nulls bool,
OUT vars bool,
OUT data text,
OUT dead boolean,
OUT htid tid,
OUT tids tid[])
RETURNS SETOF record
AS 'MODULE_PATHNAME', 'bt_page_items_bytea'
LANGUAGE C STRICT PARALLEL SAFE;
......@@ -300,13 +300,14 @@ test=# SELECT t_ctid, raw_flags, combined_flags
test=# SELECT * FROM bt_metap('pg_cast_oid_index');
-[ RECORD 1 ]-----------+-------
magic | 340322
version | 3
version | 4
root | 1
level | 0
fastroot | 1
fastlevel | 0
oldest_xact | 582
last_cleanup_num_tuples | 1000
allequalimage | f
</screen>
</para>
</listitem>
......@@ -329,11 +330,11 @@ test=# SELECT * FROM bt_page_stats('pg_cast_oid_index', 1);
-[ RECORD 1 ]-+-----
blkno | 1
type | l
live_items | 256
live_items | 224
dead_items | 0
avg_item_size | 12
avg_item_size | 16
page_size | 8192
free_size | 4056
free_size | 3668
btpo_prev | 0
btpo_next | 0
btpo | 0
......@@ -356,33 +357,75 @@ btpo_flags | 3
<function>bt_page_items</function> returns detailed information about
all of the items on a B-tree index page. For example:
<screen>
test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1);
itemoffset | ctid | itemlen | nulls | vars | data
------------+---------+---------+-------+------+-------------
1 | (0,1) | 12 | f | f | 23 27 00 00
2 | (0,2) | 12 | f | f | 24 27 00 00
3 | (0,3) | 12 | f | f | 25 27 00 00
4 | (0,4) | 12 | f | f | 26 27 00 00
5 | (0,5) | 12 | f | f | 27 27 00 00
6 | (0,6) | 12 | f | f | 28 27 00 00
7 | (0,7) | 12 | f | f | 29 27 00 00
8 | (0,8) | 12 | f | f | 2a 27 00 00
test=# SELECT itemoffset, ctid, itemlen, nulls, vars, data, dead, htid, tids[0:2] AS some_tids
FROM bt_page_items(get_raw_page('tenk2_hundred', 5));
itemoffset | ctid | itemlen | nulls | vars | data | dead | htid | some_tids
------------+-----------+---------+-------+------+-------------------------+------+--------+---------------------
1 | (16,1) | 16 | f | f | 30 00 00 00 00 00 00 00 | | |
2 | (16,8292) | 616 | f | f | 24 00 00 00 00 00 00 00 | f | (1,6) | {"(1,6)","(10,22)"}
3 | (16,8292) | 616 | f | f | 25 00 00 00 00 00 00 00 | f | (1,18) | {"(1,18)","(4,22)"}
4 | (16,8292) | 616 | f | f | 26 00 00 00 00 00 00 00 | f | (4,18) | {"(4,18)","(6,17)"}
5 | (16,8292) | 616 | f | f | 27 00 00 00 00 00 00 00 | f | (1,2) | {"(1,2)","(1,19)"}
6 | (16,8292) | 616 | f | f | 28 00 00 00 00 00 00 00 | f | (2,24) | {"(2,24)","(4,11)"}
7 | (16,8292) | 616 | f | f | 29 00 00 00 00 00 00 00 | f | (2,17) | {"(2,17)","(11,2)"}
8 | (16,8292) | 616 | f | f | 2a 00 00 00 00 00 00 00 | f | (0,25) | {"(0,25)","(3,20)"}
9 | (16,8292) | 616 | f | f | 2b 00 00 00 00 00 00 00 | f | (0,10) | {"(0,10)","(0,14)"}
10 | (16,8292) | 616 | f | f | 2c 00 00 00 00 00 00 00 | f | (1,3) | {"(1,3)","(3,9)"}
11 | (16,8292) | 616 | f | f | 2d 00 00 00 00 00 00 00 | f | (6,28) | {"(6,28)","(11,1)"}
12 | (16,8292) | 616 | f | f | 2e 00 00 00 00 00 00 00 | f | (0,27) | {"(0,27)","(1,13)"}
13 | (16,8292) | 616 | f | f | 2f 00 00 00 00 00 00 00 | f | (4,17) | {"(4,17)","(4,21)"}
(13 rows)
</screen>
In a B-tree leaf page, <structfield>ctid</structfield> points to a heap tuple.
In an internal page, the block number part of <structfield>ctid</structfield>
points to another page in the index itself, while the offset part
(the second number) is ignored and is usually 1.
This is a B-tree leaf page. All tuples that point to the table
happen to be posting list tuples (each of which stores
100 six-byte TIDs).  There is also a <quote>high key</quote> tuple
at <literal>itemoffset</literal> number 1.
<structfield>ctid</structfield> is used to store encoded
information about each tuple in this example, though leaf page
tuples often store a heap TID directly in the
<structfield>ctid</structfield> field instead.
<structfield>tids</structfield> is the list of TIDs stored as a
posting list.
</para>
<para>
In an internal page (not shown), the block number part of
<structfield>ctid</structfield> is a <quote>downlink</quote>,
which is a block number of another page in the index itself.
The offset part (the second number) of
<structfield>ctid</structfield> stores encoded information about
the tuple, such as the number of columns present (suffix
truncation may have removed unneeded suffix columns). Truncated
columns are treated as having the value <quote>minus
infinity</quote>.
</para>
<para>
<structfield>htid</structfield> shows a heap TID for the tuple,
regardless of the underlying tuple representation. This value
may match <structfield>ctid</structfield>, or may be decoded
from the alternative representations used by posting list tuples
and tuples from internal pages. Tuples in internal pages
usually have the implementation-level heap TID column truncated
away, which is represented as a NULL
<structfield>htid</structfield> value.
</para>
<para>
Note that the first item on any non-rightmost page (any page with
a non-zero value in the <structfield>btpo_next</structfield> field) is the
page's <quote>high key</quote>, meaning its <structfield>data</structfield>
serves as an upper bound on all items appearing on the page, while
its <structfield>ctid</structfield> field is meaningless. Also, on non-leaf
pages, the first real data item (the first item that is not a high
key) is a <quote>minus infinity</quote> item, with no actual value
in its <structfield>data</structfield> field. Such an item does have a valid
downlink in its <structfield>ctid</structfield> field, however.
its <structfield>ctid</structfield> field does not point to
another block. Also, on internal pages, the first real data
item (the first item that is not a high key) reliably has every
column truncated away, leaving no actual value in its
<structfield>data</structfield> field. Such an item does have a
valid downlink in its <structfield>ctid</structfield> field,
however.
</para>
<para>
For more details about the structure of B-tree indexes, see
<xref linkend="btree-structure"/>. For more details about
deduplication and posting lists, see <xref
linkend="btree-deduplication"/>.
</para>
</listitem>
</varlistentry>
......@@ -402,17 +445,24 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1);
with <function>get_raw_page</function> should be passed as argument. So
the last example could also be rewritten like this:
<screen>
test=# SELECT * FROM bt_page_items(get_raw_page('pg_cast_oid_index', 1));
itemoffset | ctid | itemlen | nulls | vars | data
------------+---------+---------+-------+------+-------------
1 | (0,1) | 12 | f | f | 23 27 00 00
2 | (0,2) | 12 | f | f | 24 27 00 00
3 | (0,3) | 12 | f | f | 25 27 00 00
4 | (0,4) | 12 | f | f | 26 27 00 00
5 | (0,5) | 12 | f | f | 27 27 00 00
6 | (0,6) | 12 | f | f | 28 27 00 00
7 | (0,7) | 12 | f | f | 29 27 00 00
8 | (0,8) | 12 | f | f | 2a 27 00 00
test=# SELECT itemoffset, ctid, itemlen, nulls, vars, data, dead, htid, tids[0:2] AS some_tids
FROM bt_page_items(get_raw_page('tenk2_hundred', 5));
itemoffset | ctid | itemlen | nulls | vars | data | dead | htid | some_tids
------------+-----------+---------+-------+------+-------------------------+------+--------+---------------------
1 | (16,1) | 16 | f | f | 30 00 00 00 00 00 00 00 | | |
2 | (16,8292) | 616 | f | f | 24 00 00 00 00 00 00 00 | f | (1,6) | {"(1,6)","(10,22)"}
3 | (16,8292) | 616 | f | f | 25 00 00 00 00 00 00 00 | f | (1,18) | {"(1,18)","(4,22)"}
4 | (16,8292) | 616 | f | f | 26 00 00 00 00 00 00 00 | f | (4,18) | {"(4,18)","(6,17)"}
5 | (16,8292) | 616 | f | f | 27 00 00 00 00 00 00 00 | f | (1,2) | {"(1,2)","(1,19)"}
6 | (16,8292) | 616 | f | f | 28 00 00 00 00 00 00 00 | f | (2,24) | {"(2,24)","(4,11)"}
7 | (16,8292) | 616 | f | f | 29 00 00 00 00 00 00 00 | f | (2,17) | {"(2,17)","(11,2)"}
8 | (16,8292) | 616 | f | f | 2a 00 00 00 00 00 00 00 | f | (0,25) | {"(0,25)","(3,20)"}
9 | (16,8292) | 616 | f | f | 2b 00 00 00 00 00 00 00 | f | (0,10) | {"(0,10)","(0,14)"}
10 | (16,8292) | 616 | f | f | 2c 00 00 00 00 00 00 00 | f | (1,3) | {"(1,3)","(3,9)"}
11 | (16,8292) | 616 | f | f | 2d 00 00 00 00 00 00 00 | f | (6,28) | {"(6,28)","(11,1)"}
12 | (16,8292) | 616 | f | f | 2e 00 00 00 00 00 00 00 | f | (0,27) | {"(0,27)","(1,13)"}
13 | (16,8292) | 616 | f | f | 2f 00 00 00 00 00 00 00 | f | (4,17) | {"(4,17)","(4,21)"}
(13 rows)
</screen>
All the other details are the same as explained in the previous item.
</para>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment