Commit 7516f525 authored by Alvaro Herrera's avatar Alvaro Herrera

BRIN: Block Range Indexes

BRIN is a new index access method intended to accelerate scans of very
large tables, without the maintenance overhead of btrees or other
traditional indexes.  They work by maintaining "summary" data about
block ranges.  Bitmap index scans work by reading each summary tuple and
comparing them with the query quals; all pages in the range are returned
in a lossy TID bitmap if the quals are consistent with the values in the
summary tuple, otherwise not.  Normal index scans are not supported
because these indexes do not store TIDs.

As new tuples are added into the index, the summary information is
updated (if the block range in which the tuple is added is already
summarized) or not; in the latter case, a subsequent pass of VACUUM or
the brin_summarize_new_values() function will create the summary
information.

For data types with natural 1-D sort orders, the summary info consists
of the maximum and the minimum values of each indexed column within each
page range.  This type of operator class we call "Minmax", and we
supply a bunch of them for most data types with B-tree opclasses.
Since the BRIN code is generalized, other approaches are possible for
things such as arrays, geometric types, ranges, etc; even for things
such as enum types we could do something different than minmax with
better results.  In this commit I only include minmax.

Catalog version bumped due to new builtin catalog entries.

There's more that could be done here, but this is a good step forwards.

Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera,
with contribution by Heikki Linnakangas.

Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas.
Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo.

PS:
  The research leading to these results has received funding from the
  European Union's Seventh Framework Programme (FP7/2007-2013) under
  grant agreement n° 318633.
parent 1961b1c1
# contrib/pageinspect/Makefile
MODULE_big = pageinspect
OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o $(WIN32RES)
OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o brinfuncs.o $(WIN32RES)
EXTENSION = pageinspect
DATA = pageinspect--1.2.sql pageinspect--1.0--1.1.sql \
DATA = pageinspect--1.3.sql pageinspect--1.0--1.1.sql \
pageinspect--1.2--1.3.sql \
pageinspect--1.1--1.2.sql pageinspect--unpackaged--1.0.sql
PGFILEDESC = "pageinspect - functions to inspect contents of database pages"
......
/*
* brinfuncs.c
* Functions to investigate BRIN indexes
*
* Copyright (c) 2014, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/pageinspect/brinfuncs.c
*/
#include "postgres.h"
#include "access/htup_details.h"
#include "access/brin.h"
#include "access/brin_internal.h"
#include "access/brin_page.h"
#include "access/brin_revmap.h"
#include "access/brin_tuple.h"
#include "catalog/index.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "lib/stringinfo.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "miscadmin.h"
PG_FUNCTION_INFO_V1(brin_page_type);
PG_FUNCTION_INFO_V1(brin_page_items);
PG_FUNCTION_INFO_V1(brin_metapage_info);
PG_FUNCTION_INFO_V1(brin_revmap_data);
typedef struct brin_column_state
{
int nstored;
FmgrInfo outputFn[FLEXIBLE_ARRAY_MEMBER];
} brin_column_state;
typedef struct brin_page_state
{
BrinDesc *bdesc;
Page page;
OffsetNumber offset;
bool unusedItem;
bool done;
AttrNumber attno;
BrinMemTuple *dtup;
brin_column_state *columns[FLEXIBLE_ARRAY_MEMBER];
} brin_page_state;
static Page verify_brin_page(bytea *raw_page, uint16 type,
const char *strtype);
Datum
brin_page_type(PG_FUNCTION_ARGS)
{
bytea *raw_page = PG_GETARG_BYTEA_P(0);
Page page = VARDATA(raw_page);
BrinSpecialSpace *special;
char *type;
special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
switch (special->type)
{
case BRIN_PAGETYPE_META:
type = "meta";
break;
case BRIN_PAGETYPE_REVMAP:
type = "revmap";
break;
case BRIN_PAGETYPE_REGULAR:
type = "regular";
break;
default:
type = psprintf("unknown (%02x)", special->type);
break;
}
PG_RETURN_TEXT_P(cstring_to_text(type));
}
/*
* Verify that the given bytea contains a BRIN page of the indicated page
* type, or die in the attempt. A pointer to the page is returned.
*/
static Page
verify_brin_page(bytea *raw_page, uint16 type, const char *strtype)
{
Page page;
int raw_page_size;
BrinSpecialSpace *special;
raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
if (raw_page_size < SizeOfPageHeaderData)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("input page too small"),
errdetail("Expected size %d, got %d", raw_page_size, BLCKSZ)));
page = VARDATA(raw_page);
/* verify the special space says this page is what we want */
special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
if (special->type != type)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("page is not a BRIN page of type \"%s\"", strtype),
errdetail("Expected special type %08x, got %08x.",
type, special->type)));
return page;
}
/*
* Extract all item values from a BRIN index page
*
* Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass);
*/
Datum
brin_page_items(PG_FUNCTION_ARGS)
{
brin_page_state *state;
FuncCallContext *fctx;
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be superuser to use raw page functions"))));
if (SRF_IS_FIRSTCALL())
{
bytea *raw_page = PG_GETARG_BYTEA_P(0);
Oid indexRelid = PG_GETARG_OID(1);
Page page;
TupleDesc tupdesc;
MemoryContext mctx;
Relation indexRel;
AttrNumber attno;
/* minimally verify the page we got */
page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular");
/* create a function context for cross-call persistence */
fctx = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
indexRel = index_open(indexRelid, AccessShareLock);
state = palloc(offsetof(brin_page_state, columns) +
sizeof(brin_column_state) * RelationGetDescr(indexRel)->natts);
state->bdesc = brin_build_desc(indexRel);
state->page = page;
state->offset = FirstOffsetNumber;
state->unusedItem = false;
state->done = false;
state->dtup = NULL;
/*
* Initialize output functions for all indexed datatypes; simplifies
* calling them later.
*/
for (attno = 1; attno <= state->bdesc->bd_tupdesc->natts; attno++)
{
Oid output;
bool isVarlena;
BrinOpcInfo *opcinfo;
int i;
brin_column_state *column;
opcinfo = state->bdesc->bd_info[attno - 1];
column = palloc(offsetof(brin_column_state, outputFn) +
sizeof(FmgrInfo) * opcinfo->oi_nstored);
column->nstored = opcinfo->oi_nstored;
for (i = 0; i < opcinfo->oi_nstored; i++)
{
getTypeOutputInfo(opcinfo->oi_typids[i], &output, &isVarlena);
fmgr_info(output, &column->outputFn[i]);
}
state->columns[attno - 1] = column;
}
index_close(indexRel, AccessShareLock);
fctx->user_fctx = state;
fctx->tuple_desc = BlessTupleDesc(tupdesc);
MemoryContextSwitchTo(mctx);
}
fctx = SRF_PERCALL_SETUP();
state = fctx->user_fctx;
if (!state->done)
{
HeapTuple result;
Datum values[7];
bool nulls[7];
/*
* This loop is called once for every attribute of every tuple in the
* page. At the start of a tuple, we get a NULL dtup; that's our
* signal for obtaining and decoding the next one. If that's not the
* case, we output the next attribute.
*/
if (state->dtup == NULL)
{
BrinTuple *tup;
MemoryContext mctx;
ItemId itemId;
/* deformed tuple must live across calls */
mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
/* verify item status: if there's no data, we can't decode */
itemId = PageGetItemId(state->page, state->offset);
if (ItemIdIsUsed(itemId))
{
tup = (BrinTuple *) PageGetItem(state->page,
PageGetItemId(state->page,
state->offset));
state->dtup = brin_deform_tuple(state->bdesc, tup);
state->attno = 1;
state->unusedItem = false;
}
else
state->unusedItem = true;
MemoryContextSwitchTo(mctx);
}
else
state->attno++;
MemSet(nulls, 0, sizeof(nulls));
if (state->unusedItem)
{
values[0] = UInt16GetDatum(state->offset);
nulls[1] = true;
nulls[2] = true;
nulls[3] = true;
nulls[4] = true;
nulls[5] = true;
nulls[6] = true;
}
else
{
int att = state->attno - 1;
values[0] = UInt16GetDatum(state->offset);
values[1] = UInt32GetDatum(state->dtup->bt_blkno);
values[2] = UInt16GetDatum(state->attno);
values[3] = BoolGetDatum(state->dtup->bt_columns[att].bv_allnulls);
values[4] = BoolGetDatum(state->dtup->bt_columns[att].bv_hasnulls);
values[5] = BoolGetDatum(state->dtup->bt_placeholder);
if (!state->dtup->bt_columns[att].bv_allnulls)
{
BrinValues *bvalues = &state->dtup->bt_columns[att];
StringInfoData s;
bool first;
int i;
initStringInfo(&s);
appendStringInfoChar(&s, '{');
first = true;
for (i = 0; i < state->columns[att]->nstored; i++)
{
char *val;
if (!first)
appendStringInfoString(&s, " .. ");
first = false;
val = OutputFunctionCall(&state->columns[att]->outputFn[i],
bvalues->bv_values[i]);
appendStringInfoString(&s, val);
pfree(val);
}
appendStringInfoChar(&s, '}');
values[6] = CStringGetTextDatum(s.data);
pfree(s.data);
}
else
{
nulls[6] = true;
}
}
result = heap_form_tuple(fctx->tuple_desc, values, nulls);
/*
* If the item was unused, jump straight to the next one; otherwise,
* the only cleanup needed here is to set our signal to go to the next
* tuple in the following iteration, by freeing the current one.
*/
if (state->unusedItem)
state->offset = OffsetNumberNext(state->offset);
else if (state->attno >= state->bdesc->bd_tupdesc->natts)
{
pfree(state->dtup);
state->dtup = NULL;
state->offset = OffsetNumberNext(state->offset);
}
/*
* If we're beyond the end of the page, set flag to end the function in
* the following iteration.
*/
if (state->offset > PageGetMaxOffsetNumber(state->page))
state->done = true;
SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result));
}
brin_free_desc(state->bdesc);
SRF_RETURN_DONE(fctx);
}
Datum
brin_metapage_info(PG_FUNCTION_ARGS)
{
bytea *raw_page = PG_GETARG_BYTEA_P(0);
Page page;
BrinMetaPageData *meta;
TupleDesc tupdesc;
Datum values[4];
bool nulls[4];
HeapTuple htup;
page = verify_brin_page(raw_page, BRIN_PAGETYPE_META, "metapage");
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
tupdesc = BlessTupleDesc(tupdesc);
/* Extract values from the metapage */
meta = (BrinMetaPageData *) PageGetContents(page);
MemSet(nulls, 0, sizeof(nulls));
values[0] = CStringGetTextDatum(psprintf("0x%08X", meta->brinMagic));
values[1] = Int32GetDatum(meta->brinVersion);
values[2] = Int32GetDatum(meta->pagesPerRange);
values[3] = Int64GetDatum(meta->lastRevmapPage);
htup = heap_form_tuple(tupdesc, values, nulls);
PG_RETURN_DATUM(HeapTupleGetDatum(htup));
}
/*
* Return the TID array stored in a BRIN revmap page
*/
Datum
brin_revmap_data(PG_FUNCTION_ARGS)
{
struct
{
ItemPointerData *tids;
int idx;
} *state;
FuncCallContext *fctx;
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be superuser to use raw page functions"))));
if (SRF_IS_FIRSTCALL())
{
bytea *raw_page = PG_GETARG_BYTEA_P(0);
MemoryContext mctx;
Page page;
/* minimally verify the page we got */
page = verify_brin_page(raw_page, BRIN_PAGETYPE_REVMAP, "revmap");
/* create a function context for cross-call persistence */
fctx = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
state = palloc(sizeof(*state));
state->tids = ((RevmapContents *) PageGetContents(page))->rm_tids;
state->idx = 0;
fctx->user_fctx = state;
MemoryContextSwitchTo(mctx);
}
fctx = SRF_PERCALL_SETUP();
state = fctx->user_fctx;
if (state->idx < REVMAP_PAGE_MAXITEMS)
SRF_RETURN_NEXT(fctx, PointerGetDatum(&state->tids[state->idx++]));
SRF_RETURN_DONE(fctx);
}
/* contrib/pageinspect/pageinspect--1.2--1.3.sql */
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.3'" to load this file. \quit
--
-- brin_page_type()
--
CREATE FUNCTION brin_page_type(IN page bytea)
RETURNS text
AS 'MODULE_PATHNAME', 'brin_page_type'
LANGUAGE C STRICT;
--
-- brin_metapage_info()
--
CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text,
OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint)
AS 'MODULE_PATHNAME', 'brin_metapage_info'
LANGUAGE C STRICT;
--
-- brin_revmap_data()
CREATE FUNCTION brin_revmap_data(IN page bytea,
OUT pages tid)
RETURNS SETOF tid
AS 'MODULE_PATHNAME', 'brin_revmap_data'
LANGUAGE C STRICT;
--
-- brin_page_items()
--
CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass,
OUT itemoffset int,
OUT blknum int,
OUT attnum int,
OUT allnulls bool,
OUT hasnulls bool,
OUT placeholder bool,
OUT value text)
RETURNS SETOF record
AS 'MODULE_PATHNAME', 'brin_page_items'
LANGUAGE C STRICT;
/* contrib/pageinspect/pageinspect--1.2.sql */
/* contrib/pageinspect/pageinspect--1.3.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pageinspect" to load this file. \quit
......@@ -98,6 +98,45 @@ RETURNS SETOF record
AS 'MODULE_PATHNAME', 'bt_page_items'
LANGUAGE C STRICT;
--
-- brin_page_type()
--
CREATE FUNCTION brin_page_type(IN page bytea)
RETURNS text
AS 'MODULE_PATHNAME', 'brin_page_type'
LANGUAGE C STRICT;
--
-- brin_metapage_info()
--
CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text,
OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint)
AS 'MODULE_PATHNAME', 'brin_metapage_info'
LANGUAGE C STRICT;
--
-- brin_revmap_data()
CREATE FUNCTION brin_revmap_data(IN page bytea,
OUT pages tid)
RETURNS SETOF tid
AS 'MODULE_PATHNAME', 'brin_revmap_data'
LANGUAGE C STRICT;
--
-- brin_page_items()
--
CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass,
OUT itemoffset int,
OUT blknum int,
OUT attnum int,
OUT allnulls bool,
OUT hasnulls bool,
OUT placeholder bool,
OUT value text)
RETURNS SETOF record
AS 'MODULE_PATHNAME', 'brin_page_items'
LANGUAGE C STRICT;
--
-- fsm_page_contents()
--
......
# pageinspect extension
comment = 'inspect the contents of database pages at a low level'
default_version = '1.2'
default_version = '1.3'
module_pathname = '$libdir/pageinspect'
relocatable = true
......@@ -8,6 +8,7 @@
#define FRONTEND 1
#include "postgres.h"
#include "access/brin_xlog.h"
#include "access/clog.h"
#include "access/gin.h"
#include "access/gist_private.h"
......
<!-- doc/src/sgml/brin.sgml -->
<chapter id="BRIN">
<title>BRIN Indexes</title>
<indexterm>
<primary>index</primary>
<secondary>BRIN</secondary>
</indexterm>
<sect1 id="brin-intro">
<title>Introduction</title>
<para>
<acronym>BRIN</acronym> stands for Block Range Index.
<acronym>BRIN</acronym> is designed for handling very large tables
in which certain columns have some natural correlation with their
physical location within the table.
A <firstterm>block range</> is a group of pages that are physically
adjacent in the table; for each block range, some summary info is stored
by the index.
For example, a table storing a store's sale orders might have
a date column on which each order was placed, and most of the time
the entries for earlier orders will appear earlier in the table as well;
a table storing a ZIP code column might have all codes for a city
grouped together naturally.
</para>
<para>
<acronym>BRIN</acronym> indexes can satisfy queries via regular bitmap
index scans, and will return all tuples in all pages within each range if
the summary info stored by the index is <firstterm>consistent</> with the
query conditions.
The query executor is in charge of rechecking these tuples and discarding
those that do not match the query conditions &mdash; in other words, these
indexes are lossy.
Because a <acronym>BRIN</acronym> index is very small, scanning the index
adds little overhead compared to a sequential scan, but may avoid scanning
large parts of the table that are known not to contain matching tuples.
</para>
<para>
The specific data that a <acronym>BRIN</acronym> index will store,
as well as the specific queries that the index will be able to satisfy,
depend on the operator class selected for each column of the index.
Data types having a linear sort order can have operator classes that
store the minimum and maximum value within each block range, for instance;
geometrical types might store the bounding box for all the objects
in the block range.
</para>
<para>
The size of the block range is determined at index creation time by
the <literal>pages_per_range</> storage parameter. The number of index
entries will be equal to the size of the relation in pages divided by
the selected value for <literal>pages_per_range</>. Therefore, the smaller
the number, the larger the index becomes (because of the need to
store more index entries), but at the same time the summary data stored can
be more precise and more data blocks can be skipped during an index scan.
</para>
</sect1>
<sect1 id="brin-builtin-opclasses">
<title>Built-in Operator Classes</title>
<para>
The core <productname>PostgreSQL</productname> distribution includes
includes the <acronym>BRIN</acronym> operator classes shown in
<xref linkend="brin-builtin-opclasses-table">.
</para>
<para>
The <firstterm>minmax</>
operator classes store the minimum and the maximum values appearing
in the indexed column within the range.
</para>
<table id="brin-builtin-opclasses-table">
<title>Built-in <acronym>BRIN</acronym> Operator Classes</title>
<tgroup cols="3">
<thead>
<row>
<entry>Name</entry>
<entry>Indexed Data Type</entry>
<entry>Indexable Operators</entry>
</row>
</thead>
<tbody>
<row>
<entry><literal>bytea_minmax_ops</literal></entry>
<entry><type>bytea</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>char_minmax_ops</literal></entry>
<entry><type>"char"</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>name_minmax_ops</literal></entry>
<entry><type>name</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>int8_minmax_ops</literal></entry>
<entry><type>bigint</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>int2_minmax_ops</literal></entry>
<entry><type>smallint</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>int4_minmax_ops</literal></entry>
<entry><type>integer</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>text_minmax_ops</literal></entry>
<entry><type>text</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>oid_minmax_ops</literal></entry>
<entry><type>oid</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>tid_minmax_ops</literal></entry>
<entry><type>tid</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>float4_minmax_ops</literal></entry>
<entry><type>real</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>float8_minmax_ops</literal></entry>
<entry><type>double precision</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>abstime_minmax_ops</literal></entry>
<entry><type>abstime</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>reltime_minmax_ops</literal></entry>
<entry><type>reltime</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>macaddr_minmax_ops</literal></entry>
<entry><type>macaddr</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>inet_minmax_ops</literal></entry>
<entry><type>inet</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>bpchar_minmax_ops</literal></entry>
<entry><type>character</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>date_minmax_ops</literal></entry>
<entry><type>date</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>time_minmax_ops</literal></entry>
<entry><type>time without time zone</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>timestamp_minmax_ops</literal></entry>
<entry><type>timestamp without time zone</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>timestamptz_minmax_ops</literal></entry>
<entry><type>timestamp with time zone</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>interval_minmax_ops</literal></entry>
<entry><type>interval</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>timetz_minmax_ops</literal></entry>
<entry><type>time with time zone</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>bit_minmax_ops</literal></entry>
<entry><type>bit</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>varbit_minmax_ops</literal></entry>
<entry><type>bit varying</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>numeric_minmax_ops</literal></entry>
<entry><type>numeric</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>uuid_minmax_ops</literal></entry>
<entry><type>uuid</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
<row>
<entry><literal>pg_lsn_minmax_ops</literal></entry>
<entry><type>pg_lsn</type></entry>
<entry>
<literal>&lt;</literal>
<literal>&lt;=</literal>
<literal>=</literal>
<literal>&gt;=</literal>
<literal>&gt;</literal>
</entry>
</row>
</tbody>
</tgroup>
</table>
</sect1>
<sect1 id="brin-extensibility">
<title>Extensibility</title>
<para>
The <acronym>BRIN</acronym> interface has a high level of abstraction,
requiring the access method implementer only to implement the semantics
of the data type being accessed. The <acronym>BRIN</acronym> layer
itself takes care of concurrency, logging and searching the index structure.
</para>
<para>
All it takes to get a <acronym>BRIN</acronym> access method working is to
implement a few user-defined methods, which define the behavior of
summary values stored in the index and the way they interact with
scan keys.
In short, <acronym>BRIN</acronym> combines
extensibility with generality, code reuse, and a clean interface.
</para>
<para>
There are four methods that an operator class for <acronym>BRIN</acronym>
must provide:
<variablelist>
<varlistentry>
<term><function>BrinOpcInfo *opcInfo(Oid type_oid)</></term>
<listitem>
<para>
Returns internal information about the indexed columns' summary data.
The return value must point to a palloc'd <structname>BrinOpcInfo</>,
which has this definition:
<programlisting>
typedef struct BrinOpcInfo
{
/* Number of columns stored in an index column of this opclass */
uint16 oi_nstored;
/* Opaque pointer for the opclass' private use */
void *oi_opaque;
/* Type IDs of the stored columns */
Oid oi_typids[FLEXIBLE_ARRAY_MEMBER];
} BrinOpcInfo;
</programlisting>
<structname>BrinOpcInfo</>.<structfield>oi_opaque</> can be used by the
operator class routines to pass information between support procedures
during an index scan.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><function>bool consistent(BrinDesc *bdesc, BrinValues *column,
ScanKey key)</function></term>
<listitem>
<para>
Returns whether the ScanKey is consistent with the given indexed
values for a range.
The attribute number to use is passed as part of the scan key.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><function>bool addValue(BrinDesc *bdesc, BrinValues *column,
Datum newval, bool isnull)</function></term>
<listitem>
<para>
Given an index tuple and an indexed value, modifies the indicated
attribute of the tuple so that it additionally represents the new value.
If any modification was done to the tuple, <literal>true</literal> is
returned.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><function>bool unionTuples(BrinDesc *bdesc, BrinValues *a,
BrinValues *b)</function></term>
<listitem>
<para>
Consolidates two index tuples. Given two index tuples, modifies the
indicated attribute of the first of them so that it represents both tuples.
The second tuple is not modified.
</para>
</listitem>
</varlistentry>
</variablelist>
To implement these methods in a generic way, the operator class
defines its own internal support functions.
(For instance, <quote>min/max</> operator classes implements
support functions for the four inequality operators for the data type.)
Additionally, the operator class must supply appropriate
operator entries,
to enable the optimizer to use the index when those operators are
used in queries.
</para>
</sect1>
</chapter>
......@@ -87,6 +87,7 @@
<!ENTITY gist SYSTEM "gist.sgml">
<!ENTITY spgist SYSTEM "spgist.sgml">
<!ENTITY gin SYSTEM "gin.sgml">
<!ENTITY brin SYSTEM "brin.sgml">
<!ENTITY planstats SYSTEM "planstats.sgml">
<!ENTITY indexam SYSTEM "indexam.sgml">
<!ENTITY nls SYSTEM "nls.sgml">
......
......@@ -116,7 +116,8 @@ CREATE INDEX test1_id_index ON test1 (id);
<para>
<productname>PostgreSQL</productname> provides several index types:
B-tree, Hash, GiST, SP-GiST and GIN. Each index type uses a different
B-tree, Hash, GiST, SP-GiST, GIN and BRIN.
Each index type uses a different
algorithm that is best suited to different types of queries.
By default, the <command>CREATE INDEX</command> command creates
B-tree indexes, which fit the most common situations.
......@@ -326,6 +327,39 @@ SELECT * FROM places ORDER BY location <-> point '(101,456)' LIMIT 10;
classes are available in the <literal>contrib</> collection or as separate
projects. For more information see <xref linkend="GIN">.
</para>
<para>
<indexterm>
<primary>index</primary>
<secondary>BRIN</secondary>
</indexterm>
<indexterm>
<primary>BRIN</primary>
<see>index</see>
</indexterm>
BRIN indexes (a shorthand for Block Range indexes)
store summaries about the values stored in consecutive table physical block ranges.
Like GiST, SP-GiST and GIN,
BRIN can support many different indexing strategies,
and the particular operators with which a BRIN index can be used
vary depending on the indexing strategy.
For datatypes that have a linear sort order, the indexed data
corresponds to the minimum and maximum values of the
values in the column for each block range,
which support indexed queries using these operators:
<simplelist>
<member><literal>&lt;</literal></member>
<member><literal>&lt;=</literal></member>
<member><literal>=</literal></member>
<member><literal>&gt;=</literal></member>
<member><literal>&gt;</literal></member>
</simplelist>
The BRIN operator classes included in the standard distribution are
documented in <xref linkend="brin-builtin-opclasses-table">.
For more information see <xref linkend="BRIN">.
</para>
</sect1>
......
......@@ -196,6 +196,110 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1);
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>brin_page_type(page bytea) returns text</function>
<indexterm>
<primary>brin_page_type</primary>
</indexterm>
</term>
<listitem>
<para>
<function>brin_page_type</function> returns the page type of the given
<acronym>BRIN</acronym> index page, or throws an error if the page is
not a valid <acronym>BRIN</acronym> page. For example:
<screen>
brintest=# select brin_page_type(get_raw_page('brinidx', 0));
brin_page_type
----------------
meta
</screen>
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>brin_metapage_info(page bytea) returns record</function>
<indexterm>
<primary>brin_metapage_info</primary>
</indexterm>
</term>
<listitem>
<para>
<function>brin_metapage_info</function> returns assorted information
about a <acronym>BRIN</acronym> index metapage. For example:
<screen>
brintest=# select * from brin_metapage_info(get_raw_page('brinidx', 0));
magic | version | pagesperrange | lastrevmappage
------------+---------+---------------+----------------
0xA8109CFA | 1 | 4 | 2
</screen>
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>brin_revmap_data(page bytea) returns setof tid</function>
<indexterm>
<primary>brin_revmap_data</primary>
</indexterm>
</term>
<listitem>
<para>
<function>brin_revmap_data</function> returns the list of tuple
identifiers in a <acronym>BRIN</acronym> index range map page.
For example:
<screen>
brintest=# select * from brin_revmap_data(get_raw_page('brinidx', 2)) limit 5;
pages
---------
(6,137)
(6,138)
(6,139)
(6,140)
(6,141)
</screen>
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>brin_page_items(page bytea, index oid) returns setof record</function>
<indexterm>
<primary>brin_page_items</primary>
</indexterm>
</term>
<listitem>
<para>
<function>brin_page_items</function> returns the data stored in the
<acronym>BRIN</acronym> data page. For example:
<screen>
brintest=# select * from brin_page_items(get_raw_page('brinidx', 5),
brintest(# 'brinidx')
brintest-# order by blknum, attnum limit 6;
itemoffset | blknum | attnum | allnulls | hasnulls | placeholder | value
------------+--------+--------+----------+----------+-------------+--------------
137 | 0 | 1 | t | f | f |
137 | 0 | 2 | f | f | f | {1 .. 88}
138 | 4 | 1 | t | f | f |
138 | 4 | 2 | f | f | f | {89 .. 176}
139 | 8 | 1 | t | f | f |
139 | 8 | 2 | f | f | f | {177 .. 264}
</screen>
The returned columns correspond to the fields in the
<structname>BrinMemTuple</> and <structname>BrinValues</> structs.
See <filename>src/include/access/brin_tuple.h</> for details.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>fsm_page_contents(page bytea) returns text</function>
......
......@@ -247,6 +247,7 @@
&gist;
&spgist;
&gin;
&brin;
&storage;
&bki;
&planstats;
......
......@@ -8,6 +8,6 @@ subdir = src/backend/access
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam
SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist transam
include $(top_srcdir)/src/backend/common.mk
#-------------------------------------------------------------------------
#
# Makefile--
# Makefile for access/brin
#
# IDENTIFICATION
# src/backend/access/brin/Makefile
#
#-------------------------------------------------------------------------
subdir = src/backend/access/brin
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = brin.o brin_pageops.o brin_revmap.o brin_tuple.o brin_xlog.o \
brin_minmax.o
include $(top_srcdir)/src/backend/common.mk
Block Range Indexes (BRIN)
==========================
BRIN indexes intend to enable very fast scanning of extremely large tables.
The essential idea of a BRIN index is to keep track of summarizing values in
consecutive groups of heap pages (page ranges); for example, the minimum and
maximum values for datatypes with a btree opclass, or the bounding box for
geometric types. These values can be used to avoid scanning such pages
during a table scan, depending on query quals.
The cost of this is having to update the stored summary values of each page
range as tuples are inserted into them.
Access Method Design
--------------------
Since item pointers are not stored inside indexes of this type, it is not
possible to support the amgettuple interface. Instead, we only provide
amgetbitmap support. The amgetbitmap routine returns a lossy TIDBitmap
comprising all pages in those page ranges that match the query
qualifications. The recheck step in the BitmapHeapScan node prunes tuples
that are not visible according to the query qualifications.
An operator class must have the following entries:
- generic support procedures (pg_amproc), identical to all opclasses:
* "opcinfo" (BRIN_PROCNUM_OPCINFO) initializes a structure for index
creation or scanning
* "addValue" (BRIN_PROCNUM_ADDVALUE) takes an index tuple and a heap item,
and possibly changes the index tuple so that it includes the heap item
values
* "consistent" (BRIN_PROCNUM_CONSISTENT) takes an index tuple and query
quals, and returns whether the index tuple values match the query quals.
* "union" (BRIN_PROCNUM_UNION) takes two index tuples and modifies the first
one so that it represents the union of the two.
Procedure numbers up to 10 are reserved for future expansion.
Additionally, each opclass needs additional support functions:
- Minmax-style operator classes:
* Proc numbers 11-14 are used for the functions implementing inequality
operators for the type, in this order: less than, less or equal,
greater or equal, greater than.
Opclasses using a different design will require different additional procedure
numbers.
Operator classes also need to have operator (pg_amop) entries so that the
optimizer can choose the index to execute queries.
- Minmax-style operator classes:
* The same operators as btree (<=, <, =, >=, >)
Each index tuple stores some NULL bits and some opclass-specified values, which
are stored in a single null bitmask of length twice the number of columns. The
generic NULL bits indicate, for each column:
* bt_hasnulls: Whether there's any NULL value at all in the page range
* bt_allnulls: Whether all values are NULLs in the page range
The opclass-specified values are:
- Minmax-style operator classes
* minimum value across all tuples in the range
* maximum value across all tuples in the range
Note that the addValue and Union support procedures must be careful to
datumCopy() the values they want to store in the in-memory BRIN tuple, and
must pfree() the old copies when replacing older ones. Since some values
referenced from the tuple persist and others go away, there is no
well-defined lifetime for a memory context that would make this automatic.
The Range Map
-------------
To find the index tuple for a particular page range, we have an internal
structure we call the range map, or "revmap" for short. This stores one TID
per page range, which is the address of the index tuple summarizing that
range. Since the map entries are fixed size, it is possible to compute the
address of the range map entry for any given heap page by simple arithmetic.
When a new heap tuple is inserted in a summarized page range, we compare the
existing index tuple with the new heap tuple. If the heap tuple is outside
the summarization data given by the index tuple for any indexed column (or
if the new heap tuple contains null values but the index tuple indicates
there are no nulls), the index is updated with the new values. In many
cases it is possible to update the index tuple in-place, but if the new
index tuple is larger than the old one and there's not enough space in the
page, it is necessary to create a new index tuple with the new values. The
range map can be updated quickly to point to it; the old index tuple is
removed.
If the range map points to an invalid TID, the corresponding page range is
considered to be not summarized. When tuples are added to unsummarized
pages, nothing needs to happen.
To scan a table following a BRIN index, we scan the range map sequentially.
This yields index tuples in ascending page range order. Query quals are
matched to each index tuple; if they match, each page within the page range
is returned as part of the output TID bitmap. If there's no match, they are
skipped. Range map entries returning invalid index TIDs, that is
unsummarized page ranges, are also returned in the TID bitmap.
The revmap is stored in the first few blocks of the index main fork,
immediately following the metapage. Whenever the revmap needs to be
extended by another page, existing tuples in that page are moved to some
other page.
Heap tuples can be removed from anywhere without restriction. It might be
useful to mark the corresponding index tuple somehow, if the heap tuple is
one of the constraining values of the summary data (i.e. either min or max
in the case of a btree-opclass-bearing datatype), so that in the future we
are aware of the need to re-execute summarization on that range, leading to
a possible tightening of the summary values.
Summarization
-------------
At index creation time, the whole table is scanned; for each page range the
summarizing values of each indexed column and nulls bitmap are collected and
stored in the index. The partially-filled page range at the end of the
table is also summarized.
As new tuples get inserted at the end of the table, they may update the
index tuple that summarizes the partial page range at the end. Eventually
that page range is complete and new tuples belong in a new page range that
hasn't yet been summarized. Those insertions do not create a new index
entry; instead, the page range remains unsummarized until later.
Wehn VACUUM is run on the table, all unsummarized page ranges are
summarized. This action can also be invoked by the user via
brin_summarize_new_values(). Both these procedures scan all the
unsummarized ranges, and create a summary tuple. Again, this includes the
partially-filled page range at the end of the table.
Vacuuming
---------
Since no heap TIDs are stored in a BRIN index, it's not necessary to scan the
index when heap tuples are removed. It might be that some summary values can
be tightened if heap tuples have been deleted; but this would represent an
optimization opportunity only, not a correctness issue. It's simpler to
represent this as the need to re-run summarization on the affected page range
rather than "subtracting" values from the existing one. This is not
currently implemented.
Note that if there are no indexes on the table other than the BRIN index,
usage of maintenance_work_mem by vacuum can be decreased significantly, because
no detailed index scan needs to take place (and thus it's not necessary for
vacuum to save TIDs to remove). It's unlikely that BRIN would be the only
indexes in a table, though, because primary keys can be btrees only, and so
we don't implement this optimization.
Optimizer
---------
The optimizer selects the index based on the operator class' pg_amop
entries for the column.
Future improvements
-------------------
* Different-size page ranges?
In the current design, each "index entry" in a BRIN index covers the same
number of pages. There's no hard reason for this; it might make sense to
allow the index to self-tune so that some index entries cover smaller page
ranges, if this allows the summary values to be more compact. This would incur
larger BRIN overhead for the index itself, but might allow better pruning of
page ranges during scan. In the limit of one index tuple per page, the index
itself would occupy too much space, even though we would be able to skip
reading the most heap pages, because the summary values are tight; in the
opposite limit of a single tuple that summarizes the whole table, we wouldn't
be able to prune anything even though the index is very small. This can
probably be made to work by using the range map as an index in itself.
* More compact representation for TIDBitmap?
TIDBitmap is the structure used to represent bitmap scans. The
representation of lossy page ranges is not optimal for our purposes, because
it uses a Bitmapset to represent pages in the range; since we're going to return
all pages in a large range, it might be more convenient to allow for a
struct that uses start and end page numbers to represent the range, instead.
* Better vacuuming?
It might be useful to enable passing more useful info to BRIN indexes during
vacuuming about tuples that are deleted, i.e. do not require the callback to
pass each tuple's TID. For instance we might need a callback that passes a
block number instead of a TID. That would help determine when to re-run
summarization on blocks that have seen lots of tuple deletions.
/*
* brin.c
* Implementation of BRIN indexes for Postgres
*
* See src/backend/access/brin/README for details.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/brin/brin.c
*
* TODO
* * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
*/
#include "postgres.h"
#include "access/brin.h"
#include "access/brin_internal.h"
#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_xlog.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
/*
* We use a BrinBuildState during initial construction of a BRIN index.
* The running state is kept in a BrinMemTuple.
*/
typedef struct BrinBuildState
{
Relation bs_irel;
int bs_numtuples;
Buffer bs_currentInsertBuf;
BlockNumber bs_pagesPerRange;
BlockNumber bs_currRangeStart;
BrinRevmap *bs_rmAccess;
BrinDesc *bs_bdesc;
BrinMemTuple *bs_dtuple;
} BrinBuildState;
/*
* Struct used as "opaque" during index scans
*/
typedef struct BrinOpaque
{
BlockNumber bo_pagesPerRange;
BrinRevmap *bo_rmAccess;
BrinDesc *bo_bdesc;
} BrinOpaque;
PG_FUNCTION_INFO_V1(brin_summarize_new_values);
static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
BrinRevmap *revmap, BlockNumber pagesPerRange);
static void terminate_brin_buildstate(BrinBuildState *state);
static void brinsummarize(Relation index, Relation heapRel,
double *numSummarized, double *numExisting);
static void form_and_insert_tuple(BrinBuildState *state);
static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
BrinTuple *b);
/*
* A tuple in the heap is being inserted. To keep a brin index up to date,
* we need to obtain the relevant index tuple and compare its stored values
* with those of the new tuple. If the tuple values are not consistent with
* the summary tuple, we need to update the index tuple.
*
* If the range is not currently summarized (i.e. the revmap returns NULL for
* it), there's nothing to do.
*/
Datum
brininsert(PG_FUNCTION_ARGS)
{
Relation idxRel = (Relation) PG_GETARG_POINTER(0);
Datum *values = (Datum *) PG_GETARG_POINTER(1);
bool *nulls = (bool *) PG_GETARG_POINTER(2);
ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3);
/* we ignore the rest of our arguments */
BlockNumber pagesPerRange;
BrinDesc *bdesc = NULL;
BrinRevmap *revmap;
Buffer buf = InvalidBuffer;
MemoryContext tupcxt = NULL;
MemoryContext oldcxt = NULL;
revmap = brinRevmapInitialize(idxRel, &pagesPerRange);
for (;;)
{
bool need_insert = false;
OffsetNumber off;
BrinTuple *brtup;
BrinMemTuple *dtup;
BlockNumber heapBlk;
int keyno;
BrinTuple *tmptup PG_USED_FOR_ASSERTS_ONLY;
BrinMemTuple *tmpdtup PG_USED_FOR_ASSERTS_ONLY;
Size tmpsiz PG_USED_FOR_ASSERTS_ONLY;
CHECK_FOR_INTERRUPTS();
heapBlk = ItemPointerGetBlockNumber(heaptid);
/* normalize the block number to be the first block in the range */
heapBlk = (heapBlk / pagesPerRange) * pagesPerRange;
brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
BUFFER_LOCK_SHARE);
/* if range is unsummarized, there's nothing to do */
if (!brtup)
break;
/* First time through? */
if (bdesc == NULL)
{
bdesc = brin_build_desc(idxRel);
tupcxt = AllocSetContextCreate(CurrentMemoryContext,
"brininsert cxt",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
oldcxt = MemoryContextSwitchTo(tupcxt);
}
dtup = brin_deform_tuple(bdesc, brtup);
#ifdef USE_ASSERT_CHECKING
{
/*
* When assertions are enabled, we use this as an opportunity to
* test the "union" method, which would otherwise be used very
* rarely: first create a placeholder tuple, and addValue the
* value we just got into it. Then union the existing index tuple
* with the updated placeholder tuple. The tuple resulting from
* that union should be identical to the one resulting from the
* regular operation (straight addValue) below.
*
* Here we create the tuple to compare with; the actual comparison
* is below.
*/
tmptup = brin_form_placeholder_tuple(bdesc, heapBlk, &tmpsiz);
tmpdtup = brin_deform_tuple(bdesc, tmptup);
for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
{
BrinValues *bval;
FmgrInfo *addValue;
bval = &tmpdtup->bt_columns[keyno];
addValue = index_getprocinfo(idxRel, keyno + 1,
BRIN_PROCNUM_ADDVALUE);
FunctionCall4Coll(addValue,
idxRel->rd_indcollation[keyno],
PointerGetDatum(bdesc),
PointerGetDatum(bval),
values[keyno],
nulls[keyno]);
}
union_tuples(bdesc, tmpdtup, brtup);
tmpdtup->bt_placeholder = dtup->bt_placeholder;
tmptup = brin_form_tuple(bdesc, heapBlk, tmpdtup, &tmpsiz);
}
#endif
/*
* Compare the key values of the new tuple to the stored index values;
* our deformed tuple will get updated if the new tuple doesn't fit
* the original range (note this means we can't break out of the loop
* early). Make a note of whether this happens, so that we know to
* insert the modified tuple later.
*/
for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
{
Datum result;
BrinValues *bval;
FmgrInfo *addValue;
bval = &dtup->bt_columns[keyno];
addValue = index_getprocinfo(idxRel, keyno + 1,
BRIN_PROCNUM_ADDVALUE);
result = FunctionCall4Coll(addValue,
idxRel->rd_indcollation[keyno],
PointerGetDatum(bdesc),
PointerGetDatum(bval),
values[keyno],
nulls[keyno]);
/* if that returned true, we need to insert the updated tuple */
need_insert |= DatumGetBool(result);
}
#ifdef USE_ASSERT_CHECKING
{
/*
* Now we can compare the tuple produced by the union function
* with the one from plain addValue.
*/
BrinTuple *cmptup;
Size cmpsz;
cmptup = brin_form_tuple(bdesc, heapBlk, dtup, &cmpsz);
Assert(brin_tuples_equal(tmptup, tmpsiz, cmptup, cmpsz));
}
#endif
if (!need_insert)
{
/*
* The tuple is consistent with the new values, so there's nothing
* to do.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
else
{
Page page = BufferGetPage(buf);
ItemId lp = PageGetItemId(page, off);
Size origsz;
BrinTuple *origtup;
Size newsz;
BrinTuple *newtup;
bool samepage;
/*
* Make a copy of the old tuple, so that we can compare it after
* re-acquiring the lock.
*/
origsz = ItemIdGetLength(lp);
origtup = brin_copy_tuple(brtup, origsz);
/*
* Before releasing the lock, check if we can attempt a same-page
* update. Another process could insert a tuple concurrently in
* the same page though, so downstream we must be prepared to cope
* if this turns out to not be possible after all.
*/
samepage = brin_can_do_samepage_update(buf, origsz, newsz);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
/*
* Try to update the tuple. If this doesn't work for whatever
* reason, we need to restart from the top; the revmap might be
* pointing at a different tuple for this block now, so we need to
* recompute to ensure both our new heap tuple and the other
* inserter's are covered by the combined tuple. It might be that
* we don't need to update at all.
*/
if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
buf, off, origtup, origsz, newtup, newsz,
samepage))
{
/* no luck; start over */
MemoryContextResetAndDeleteChildren(tupcxt);
continue;
}
}
/* success! */
break;
}
brinRevmapTerminate(revmap);
if (BufferIsValid(buf))
ReleaseBuffer(buf);
if (bdesc != NULL)
{
brin_free_desc(bdesc);
MemoryContextSwitchTo(oldcxt);
MemoryContextDelete(tupcxt);
}
return BoolGetDatum(false);
}
/*
* Initialize state for a BRIN index scan.
*
* We read the metapage here to determine the pages-per-range number that this
* index was built with. Note that since this cannot be changed while we're
* holding lock on index, it's not necessary to recompute it during brinrescan.
*/
Datum
brinbeginscan(PG_FUNCTION_ARGS)
{
Relation r = (Relation) PG_GETARG_POINTER(0);
int nkeys = PG_GETARG_INT32(1);
int norderbys = PG_GETARG_INT32(2);
IndexScanDesc scan;
BrinOpaque *opaque;
scan = RelationGetIndexScan(r, nkeys, norderbys);
opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque));
opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
opaque->bo_bdesc = brin_build_desc(r);
scan->opaque = opaque;
PG_RETURN_POINTER(scan);
}
/*
* Execute the index scan.
*
* This works by reading index TIDs from the revmap, and obtaining the index
* tuples pointed to by them; the summary values in the index tuples are
* compared to the scan keys. We return into the TID bitmap all the pages in
* ranges corresponding to index tuples that match the scan keys.
*
* If a TID from the revmap is read as InvalidTID, we know that range is
* unsummarized. Pages in those ranges need to be returned regardless of scan
* keys.
*
* XXX see _bt_first on what to do about sk_subtype.
*/
Datum
bringetbitmap(PG_FUNCTION_ARGS)
{
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
Relation idxRel = scan->indexRelation;
Buffer buf = InvalidBuffer;
BrinDesc *bdesc;
Oid heapOid;
Relation heapRel;
BrinOpaque *opaque;
BlockNumber nblocks;
BlockNumber heapBlk;
int totalpages = 0;
int keyno;
FmgrInfo *consistentFn;
MemoryContext oldcxt;
MemoryContext perRangeCxt;
opaque = (BrinOpaque *) scan->opaque;
bdesc = opaque->bo_bdesc;
pgstat_count_index_scan(idxRel);
/*
* We need to know the size of the table so that we know how long to
* iterate on the revmap.
*/
heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
heapRel = heap_open(heapOid, AccessShareLock);
nblocks = RelationGetNumberOfBlocks(heapRel);
heap_close(heapRel, AccessShareLock);
/*
* Obtain consistent functions for all indexed column. Maybe it'd be
* possible to do this lazily only the first time we see a scan key that
* involves each particular attribute.
*/
consistentFn = palloc(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts);
for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
{
FmgrInfo *tmp;
tmp = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_CONSISTENT);
fmgr_info_copy(&consistentFn[keyno], tmp, CurrentMemoryContext);
}
/*
* Setup and use a per-range memory context, which is reset every time we
* loop below. This avoids having to free the tuples within the loop.
*/
perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
"bringetbitmap cxt",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
oldcxt = MemoryContextSwitchTo(perRangeCxt);
/*
* Now scan the revmap. We start by querying for heap page 0,
* incrementing by the number of pages per range; this gives us a full
* view of the table.
*/
for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
{
bool addrange;
BrinTuple *tup;
OffsetNumber off;
Size size;
CHECK_FOR_INTERRUPTS();
MemoryContextResetAndDeleteChildren(perRangeCxt);
tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
&off, &size, BUFFER_LOCK_SHARE);
if (tup)
{
tup = brin_copy_tuple(tup, size);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
/*
* For page ranges with no indexed tuple, we must return the whole
* range; otherwise, compare it to the scan keys.
*/
if (tup == NULL)
{
addrange = true;
}
else
{
BrinMemTuple *dtup;
int keyno;
dtup = brin_deform_tuple(bdesc, tup);
if (dtup->bt_placeholder)
{
/*
* Placeholder tuples are always returned, regardless of the
* values stored in them.
*/
addrange = true;
}
else
{
/*
* Compare scan keys with summary values stored for the range.
* If scan keys are matched, the page range must be added to
* the bitmap. We initially assume the range needs to be
* added; in particular this serves the case where there are
* no keys.
*/
addrange = true;
for (keyno = 0; keyno < scan->numberOfKeys; keyno++)
{
ScanKey key = &scan->keyData[keyno];
AttrNumber keyattno = key->sk_attno;
BrinValues *bval = &dtup->bt_columns[keyattno - 1];
Datum add;
/*
* The collation of the scan key must match the collation
* used in the index column (but only if the search is not
* IS NULL/ IS NOT NULL). Otherwise we shouldn't be using
* this index ...
*/
Assert((key->sk_flags & SK_ISNULL) ||
(key->sk_collation ==
bdesc->bd_tupdesc->attrs[keyattno - 1]->attcollation));
/*
* Check whether the scan key is consistent with the page
* range values; if so, have the pages in the range added
* to the output bitmap.
*
* When there are multiple scan keys, failure to meet the
* criteria for a single one of them is enough to discard
* the range as a whole, so break out of the loop as soon
* as a false return value is obtained.
*/
add = FunctionCall3Coll(&consistentFn[keyattno - 1],
key->sk_collation,
PointerGetDatum(bdesc),
PointerGetDatum(bval),
PointerGetDatum(key));
addrange = DatumGetBool(add);
if (!addrange)
break;
}
}
}
/* add the pages in the range to the output bitmap, if needed */
if (addrange)
{
BlockNumber pageno;
for (pageno = heapBlk;
pageno <= heapBlk + opaque->bo_pagesPerRange - 1;
pageno++)
{
MemoryContextSwitchTo(oldcxt);
tbm_add_page(tbm, pageno);
totalpages++;
MemoryContextSwitchTo(perRangeCxt);
}
}
}
MemoryContextSwitchTo(oldcxt);
MemoryContextDelete(perRangeCxt);
if (buf != InvalidBuffer)
ReleaseBuffer(buf);
/*
* XXX We have an approximation of the number of *pages* that our scan
* returns, but we don't have a precise idea of the number of heap tuples
* involved.
*/
PG_RETURN_INT64(totalpages * 10);
}
/*
* Re-initialize state for a BRIN index scan
*/
Datum
brinrescan(PG_FUNCTION_ARGS)
{
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1);
/* other arguments ignored */
if (scankey && scan->numberOfKeys > 0)
memmove(scan->keyData, scankey,
scan->numberOfKeys * sizeof(ScanKeyData));
PG_RETURN_VOID();
}
/*
* Close down a BRIN index scan
*/
Datum
brinendscan(PG_FUNCTION_ARGS)
{
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
brinRevmapTerminate(opaque->bo_rmAccess);
brin_free_desc(opaque->bo_bdesc);
pfree(opaque);
PG_RETURN_VOID();
}
Datum
brinmarkpos(PG_FUNCTION_ARGS)
{
elog(ERROR, "BRIN does not support mark/restore");
PG_RETURN_VOID();
}
Datum
brinrestrpos(PG_FUNCTION_ARGS)
{
elog(ERROR, "BRIN does not support mark/restore");
PG_RETURN_VOID();
}
/*
* Per-heap-tuple callback for IndexBuildHeapScan.
*
* Note we don't worry about the page range at the end of the table here; it is
* present in the build state struct after we're called the last time, but not
* inserted into the index. Caller must ensure to do so, if appropriate.
*/
static void
brinbuildCallback(Relation index,
HeapTuple htup,
Datum *values,
bool *isnull,
bool tupleIsAlive,
void *brstate)
{
BrinBuildState *state = (BrinBuildState *) brstate;
BlockNumber thisblock;
int i;
thisblock = ItemPointerGetBlockNumber(&htup->t_self);
/*
* If we're in a block that belongs to a future range, summarize what we've
* got and start afresh. Note the scan might have skipped many pages,
* if they were devoid of live tuples; make sure to insert index tuples
* for those too.
*/
while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
{
BRIN_elog(DEBUG2, "brinbuildCallback: completed a range: %u--%u",
state->bs_currRangeStart,
state->bs_currRangeStart + state->bs_pagesPerRange);
/* create the index tuple and insert it */
form_and_insert_tuple(state);
/* set state to correspond to the next range */
state->bs_currRangeStart += state->bs_pagesPerRange;
/* re-initialize state for it */
brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
}
/* Accumulate the current tuple into the running state */
for (i = 0; i < state->bs_bdesc->bd_tupdesc->natts; i++)
{
FmgrInfo *addValue;
BrinValues *col;
col = &state->bs_dtuple->bt_columns[i];
addValue = index_getprocinfo(index, i + 1,
BRIN_PROCNUM_ADDVALUE);
/*
* Update dtuple state, if and as necessary.
*/
FunctionCall4Coll(addValue,
state->bs_bdesc->bd_tupdesc->attrs[i]->attcollation,
PointerGetDatum(state->bs_bdesc),
PointerGetDatum(col),
values[i], isnull[i]);
}
}
/*
* brinbuild() -- build a new BRIN index.
*/
Datum
brinbuild(PG_FUNCTION_ARGS)
{
Relation heap = (Relation) PG_GETARG_POINTER(0);
Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
IndexBuildResult *result;
double reltuples;
double idxtuples;
BrinRevmap *revmap;
BrinBuildState *state;
Buffer meta;
BlockNumber pagesPerRange;
/*
* We expect to be called exactly once for any index relation.
*/
if (RelationGetNumberOfBlocks(index) != 0)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
/*
* Critical section not required, because on error the creation of the
* whole relation will be rolled back.
*/
meta = ReadBuffer(index, P_NEW);
Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
BRIN_CURRENT_VERSION);
MarkBufferDirty(meta);
if (RelationNeedsWAL(index))
{
xl_brin_createidx xlrec;
XLogRecPtr recptr;
XLogRecData rdata;
Page page;
xlrec.node = index->rd_node;
xlrec.version = BRIN_CURRENT_VERSION;
xlrec.pagesPerRange = BrinGetPagesPerRange(index);
rdata.buffer = InvalidBuffer;
rdata.data = (char *) &xlrec;
rdata.len = SizeOfBrinCreateIdx;
rdata.next = NULL;
recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX, &rdata);
page = BufferGetPage(meta);
PageSetLSN(page, recptr);
}
UnlockReleaseBuffer(meta);
/*
* Initialize our state, including the deformed tuple state.
*/
revmap = brinRevmapInitialize(index, &pagesPerRange);
state = initialize_brin_buildstate(index, revmap, pagesPerRange);
/*
* Now scan the relation. No syncscan allowed here because we want the
* heap blocks in physical order.
*/
reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
brinbuildCallback, (void *) state);
/* process the final batch */
form_and_insert_tuple(state);
/* release resources */
idxtuples = state->bs_numtuples;
brinRevmapTerminate(state->bs_rmAccess);
terminate_brin_buildstate(state);
/*
* Return statistics
*/
result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
result->heap_tuples = reltuples;
result->index_tuples = idxtuples;
PG_RETURN_POINTER(result);
}
Datum
brinbuildempty(PG_FUNCTION_ARGS)
{
Relation index = (Relation) PG_GETARG_POINTER(0);
Buffer metabuf;
/* An empty BRIN index has a metapage only. */
metabuf =
ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
/* Initialize and xlog metabuffer. */
START_CRIT_SECTION();
brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
BRIN_CURRENT_VERSION);
MarkBufferDirty(metabuf);
log_newpage_buffer(metabuf, false);
END_CRIT_SECTION();
UnlockReleaseBuffer(metabuf);
PG_RETURN_VOID();
}
/*
* brinbulkdelete
* Since there are no per-heap-tuple index tuples in BRIN indexes,
* there's not a lot we can do here.
*
* XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
* tuple is deleted), meaning the need to re-run summarization on the affected
* range. Need to an extra flag in mmtuples for that.
*/
Datum
brinbulkdelete(PG_FUNCTION_ARGS)
{
/* other arguments are not currently used */
IndexBulkDeleteResult *stats =
(IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
/* allocate stats if first time through, else re-use existing struct */
if (stats == NULL)
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
PG_RETURN_POINTER(stats);
}
/*
* This routine is in charge of "vacuuming" a BRIN index: we just summarize
* ranges that are currently unsummarized.
*/
Datum
brinvacuumcleanup(PG_FUNCTION_ARGS)
{
IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
IndexBulkDeleteResult *stats =
(IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
Relation heapRel;
/* No-op in ANALYZE ONLY mode */
if (info->analyze_only)
PG_RETURN_POINTER(stats);
if (!stats)
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
stats->num_pages = RelationGetNumberOfBlocks(info->index);
/* rest of stats is initialized by zeroing */
heapRel = heap_open(IndexGetRelation(RelationGetRelid(info->index), false),
AccessShareLock);
brinsummarize(info->index, heapRel,
&stats->num_index_tuples, &stats->num_index_tuples);
heap_close(heapRel, AccessShareLock);
PG_RETURN_POINTER(stats);
}
/*
* reloptions processor for BRIN indexes
*/
Datum
brinoptions(PG_FUNCTION_ARGS)
{
Datum reloptions = PG_GETARG_DATUM(0);
bool validate = PG_GETARG_BOOL(1);
relopt_value *options;
BrinOptions *rdopts;
int numoptions;
static const relopt_parse_elt tab[] = {
{"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)}
};
options = parseRelOptions(reloptions, validate, RELOPT_KIND_BRIN,
&numoptions);
/* if none set, we're done */
if (numoptions == 0)
PG_RETURN_NULL();
rdopts = allocateReloptStruct(sizeof(BrinOptions), options, numoptions);
fillRelOptions((void *) rdopts, sizeof(BrinOptions), options, numoptions,
validate, tab, lengthof(tab));
pfree(options);
PG_RETURN_BYTEA_P(rdopts);
}
/*
* SQL-callable function to scan through an index and summarize all ranges
* that are not currently summarized.
*/
Datum
brin_summarize_new_values(PG_FUNCTION_ARGS)
{
Oid indexoid = PG_GETARG_OID(0);
Relation indexRel;
Relation heapRel;
double numSummarized = 0;
heapRel = heap_open(IndexGetRelation(indexoid, false),
ShareUpdateExclusiveLock);
indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
brinsummarize(indexRel, heapRel, &numSummarized, NULL);
relation_close(indexRel, ShareUpdateExclusiveLock);
relation_close(heapRel, ShareUpdateExclusiveLock);
PG_RETURN_INT32((int32) numSummarized);
}
/*
* Build a BrinDesc used to create or scan a BRIN index
*/
BrinDesc *
brin_build_desc(Relation rel)
{
BrinOpcInfo **opcinfo;
BrinDesc *bdesc;
TupleDesc tupdesc;
int totalstored = 0;
int keyno;
long totalsize;
MemoryContext cxt;
MemoryContext oldcxt;
cxt = AllocSetContextCreate(CurrentMemoryContext,
"brin desc cxt",
ALLOCSET_SMALL_INITSIZE,
ALLOCSET_SMALL_MINSIZE,
ALLOCSET_SMALL_MAXSIZE);
oldcxt = MemoryContextSwitchTo(cxt);
tupdesc = RelationGetDescr(rel);
/*
* Obtain BrinOpcInfo for each indexed column. While at it, accumulate
* the number of columns stored, since the number is opclass-defined.
*/
opcinfo = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts);
for (keyno = 0; keyno < tupdesc->natts; keyno++)
{
FmgrInfo *opcInfoFn;
opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
opcinfo[keyno] = (BrinOpcInfo *)
DatumGetPointer(FunctionCall1(opcInfoFn,
tupdesc->attrs[keyno]->atttypid));
totalstored += opcinfo[keyno]->oi_nstored;
}
/* Allocate our result struct and fill it in */
totalsize = offsetof(BrinDesc, bd_info) +
sizeof(BrinOpcInfo *) * tupdesc->natts;
bdesc = palloc(totalsize);
bdesc->bd_context = cxt;
bdesc->bd_index = rel;
bdesc->bd_tupdesc = tupdesc;
bdesc->bd_disktdesc = NULL; /* generated lazily */
bdesc->bd_totalstored = totalstored;
for (keyno = 0; keyno < tupdesc->natts; keyno++)
bdesc->bd_info[keyno] = opcinfo[keyno];
pfree(opcinfo);
MemoryContextSwitchTo(oldcxt);
return bdesc;
}
void
brin_free_desc(BrinDesc *bdesc)
{
/* make sure the tupdesc is still valid */
Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
/* no need for retail pfree */
MemoryContextDelete(bdesc->bd_context);
}
/*
* Initialize a BrinBuildState appropriate to create tuples on the given index.
*/
static BrinBuildState *
initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
BlockNumber pagesPerRange)
{
BrinBuildState *state;
state = palloc(sizeof(BrinBuildState));
state->bs_irel = idxRel;
state->bs_numtuples = 0;
state->bs_currentInsertBuf = InvalidBuffer;
state->bs_pagesPerRange = pagesPerRange;
state->bs_currRangeStart = 0;
state->bs_rmAccess = revmap;
state->bs_bdesc = brin_build_desc(idxRel);
state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
return state;
}
/*
* Release resources associated with a BrinBuildState.
*/
static void
terminate_brin_buildstate(BrinBuildState *state)
{
/* release the last index buffer used */
if (!BufferIsInvalid(state->bs_currentInsertBuf))
{
Page page;
page = BufferGetPage(state->bs_currentInsertBuf);
RecordPageWithFreeSpace(state->bs_irel,
BufferGetBlockNumber(state->bs_currentInsertBuf),
PageGetFreeSpace(page));
ReleaseBuffer(state->bs_currentInsertBuf);
}
brin_free_desc(state->bs_bdesc);
pfree(state->bs_dtuple);
pfree(state);
}
/*
* Summarize the given page range of the given index.
*
* This routine can run in parallel with insertions into the heap. To avoid
* missing those values from the summary tuple, we first insert a placeholder
* index tuple into the index, then execute the heap scan; transactions
* concurrent with the scan update the placeholder tuple. After the scan, we
* union the placeholder tuple with the one computed by this routine. The
* update of the index value happens in a loop, so that if somebody updates
* the placeholder tuple after we read it, we detect the case and try again.
* This ensures that the concurrently inserted tuples are not lost.
*/
static void
summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
BlockNumber heapBlk)
{
Buffer phbuf;
BrinTuple *phtup;
Size phsz;
OffsetNumber offset;
/*
* Insert the placeholder tuple
*/
phbuf = InvalidBuffer;
phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
state->bs_rmAccess, &phbuf,
heapBlk, phtup, phsz);
/*
* Execute the partial heap scan covering the heap blocks in the specified
* page range, summarizing the heap tuples in it. This scan stops just
* short of brinbuildCallback creating the new index entry.
*/
state->bs_currRangeStart = heapBlk;
IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false,
heapBlk, state->bs_pagesPerRange,
brinbuildCallback, (void *) state);
/*
* Now we update the values obtained by the scan with the placeholder
* tuple. We do this in a loop which only terminates if we're able to
* update the placeholder tuple successfully; if we are not, this means
* somebody else modified the placeholder tuple after we read it.
*/
for (;;)
{
BrinTuple *newtup;
Size newsize;
bool didupdate;
bool samepage;
CHECK_FOR_INTERRUPTS();
/*
* Update the summary tuple and try to update.
*/
newtup = brin_form_tuple(state->bs_bdesc,
heapBlk, state->bs_dtuple, &newsize);
samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
didupdate =
brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
state->bs_rmAccess, heapBlk, phbuf, offset,
phtup, phsz, newtup, newsize, samepage);
brin_free_tuple(phtup);
brin_free_tuple(newtup);
/* If the update succeeded, we're done. */
if (didupdate)
break;
/*
* If the update didn't work, it might be because somebody updated the
* placeholder tuple concurrently. Extract the new version, union it
* with the values we have from the scan, and start over. (There are
* other reasons for the update to fail, but it's simple to treat them
* the same.)
*/
phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
&offset, &phsz, BUFFER_LOCK_SHARE);
/* the placeholder tuple must exist */
if (phtup == NULL)
elog(ERROR, "missing placeholder tuple");
phtup = brin_copy_tuple(phtup, phsz);
LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
/* merge it into the tuple from the heap scan */
union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
}
ReleaseBuffer(phbuf);
}
/*
* Scan a complete BRIN index, and summarize each page range that's not already
* summarized. The index and heap must have been locked by caller in at
* least ShareUpdateExclusiveLock mode.
*
* For each new index tuple inserted, *numSummarized (if not NULL) is
* incremented; for each existing tuple, numExisting (if not NULL) is
* incremented.
*/
static void
brinsummarize(Relation index, Relation heapRel, double *numSummarized,
double *numExisting)
{
BrinRevmap *revmap;
BrinBuildState *state = NULL;
IndexInfo *indexInfo = NULL;
BlockNumber heapNumBlocks;
BlockNumber heapBlk;
BlockNumber pagesPerRange;
Buffer buf;
revmap = brinRevmapInitialize(index, &pagesPerRange);
/*
* Scan the revmap to find unsummarized items.
*/
buf = InvalidBuffer;
heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
for (heapBlk = 0; heapBlk < heapNumBlocks; heapBlk += pagesPerRange)
{
BrinTuple *tup;
OffsetNumber off;
CHECK_FOR_INTERRUPTS();
tup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
BUFFER_LOCK_SHARE);
if (tup == NULL)
{
/* no revmap entry for this heap range. Summarize it. */
if (state == NULL)
{
/* first time through */
Assert(!indexInfo);
state = initialize_brin_buildstate(index, revmap,
pagesPerRange);
indexInfo = BuildIndexInfo(index);
/*
* We only have ShareUpdateExclusiveLock on the table, and
* therefore other sessions may insert tuples into the range
* we're going to scan. This is okay, because we take
* additional precautions to avoid losing the additional
* tuples; see comments in summarize_range. Set the
* concurrent flag, which causes IndexBuildHeapRangeScan to
* use a snapshot other than SnapshotAny, and silences
* warnings emitted there.
*/
indexInfo->ii_Concurrent = true;
/*
* If using transaction-snapshot mode, it would be possible
* for another transaction to insert a tuple that's not
* visible to our snapshot if we have already acquired one,
* when in snapshot-isolation mode; therefore, disallow this
* from running in such a transaction unless a snapshot hasn't
* been acquired yet.
*
* This code is called by VACUUM and
* brin_summarize_new_values. Have the error message mention
* the latter because VACUUM cannot run in a transaction and
* thus cannot cause this issue.
*/
if (IsolationUsesXactSnapshot() && FirstSnapshotSet)
ereport(ERROR,
(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
errmsg("brin_summarize_new_values() cannot run in a transaction that has already obtained a snapshot")));
}
summarize_range(indexInfo, state, heapRel, heapBlk);
/* and re-initialize state for the next range */
brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
if (numSummarized)
*numSummarized += 1.0;
}
else
{
if (numExisting)
*numExisting += 1.0;
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
}
if (BufferIsValid(buf))
ReleaseBuffer(buf);
/* free resources */
brinRevmapTerminate(revmap);
if (state)
terminate_brin_buildstate(state);
}
/*
* Given a deformed tuple in the build state, convert it into the on-disk
* format and insert it into the index, making the revmap point to it.
*/
static void
form_and_insert_tuple(BrinBuildState *state)
{
BrinTuple *tup;
Size size;
tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
state->bs_dtuple, &size);
brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
&state->bs_currentInsertBuf, state->bs_currRangeStart,
tup, size);
state->bs_numtuples++;
pfree(tup);
}
/*
* Given two deformed tuples, adjust the first one so that it's consistent
* with the summary values in both.
*/
static void
union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
{
int keyno;
BrinMemTuple *db;
MemoryContext cxt;
MemoryContext oldcxt;
/* Use our own memory context to avoid retail pfree */
cxt = AllocSetContextCreate(CurrentMemoryContext,
"brin union",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
oldcxt = MemoryContextSwitchTo(cxt);
db = brin_deform_tuple(bdesc, b);
MemoryContextSwitchTo(oldcxt);
for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
{
FmgrInfo *unionFn;
BrinValues *col_a = &a->bt_columns[keyno];
BrinValues *col_b = &db->bt_columns[keyno];
unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
BRIN_PROCNUM_UNION);
FunctionCall3Coll(unionFn,
bdesc->bd_index->rd_indcollation[keyno],
PointerGetDatum(bdesc),
PointerGetDatum(col_a),
PointerGetDatum(col_b));
}
MemoryContextDelete(cxt);
}
/*
* brin_minmax.c
* Implementation of Min/Max opclass for BRIN
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/brin/brin_minmax.c
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/brin_internal.h"
#include "access/brin_tuple.h"
#include "access/skey.h"
#include "catalog/pg_type.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
/*
* Procedure numbers must not collide with BRIN_PROCNUM defines in
* brin_internal.h. Note we only need inequality functions.
*/
#define MINMAX_NUM_PROCNUMS 4 /* # support procs we need */
#define PROCNUM_LESS 11
#define PROCNUM_LESSEQUAL 12
#define PROCNUM_GREATEREQUAL 13
#define PROCNUM_GREATER 14
/*
* Subtract this from procnum to obtain index in MinmaxOpaque arrays
* (Must be equal to minimum of private procnums)
*/
#define PROCNUM_BASE 11
static FmgrInfo *minmax_get_procinfo(BrinDesc *bdesc, uint16 attno,
uint16 procnum);
PG_FUNCTION_INFO_V1(minmaxOpcInfo);
PG_FUNCTION_INFO_V1(minmaxAddValue);
PG_FUNCTION_INFO_V1(minmaxConsistent);
PG_FUNCTION_INFO_V1(minmaxUnion);
typedef struct MinmaxOpaque
{
FmgrInfo operators[MINMAX_NUM_PROCNUMS];
bool inited[MINMAX_NUM_PROCNUMS];
} MinmaxOpaque;
Datum
minmaxOpcInfo(PG_FUNCTION_ARGS)
{
Oid typoid = PG_GETARG_OID(0);
BrinOpcInfo *result;
/*
* opaque->operators is initialized lazily, as indicated by 'inited' which
* is initialized to all false by palloc0.
*/
result = palloc0(MAXALIGN(SizeofBrinOpcInfo(2)) +
sizeof(MinmaxOpaque));
result->oi_nstored = 2;
result->oi_opaque = (MinmaxOpaque *)
MAXALIGN((char *) result + SizeofBrinOpcInfo(2));
result->oi_typids[0] = typoid;
result->oi_typids[1] = typoid;
PG_RETURN_POINTER(result);
}
/*
* Examine the given index tuple (which contains partial status of a certain
* page range) by comparing it to the given value that comes from another heap
* tuple. If the new value is outside the min/max range specified by the
* existing tuple values, update the index tuple and return true. Otherwise,
* return false and do not modify in this case.
*/
Datum
minmaxAddValue(PG_FUNCTION_ARGS)
{
BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
Datum newval = PG_GETARG_DATUM(2);
bool isnull = PG_GETARG_DATUM(3);
Oid colloid = PG_GET_COLLATION();
FmgrInfo *cmpFn;
Datum compar;
bool updated = false;
Form_pg_attribute attr;
AttrNumber attno;
/*
* If the new value is null, we record that we saw it if it's the first
* one; otherwise, there's nothing to do.
*/
if (isnull)
{
if (column->bv_hasnulls)
PG_RETURN_BOOL(false);
column->bv_hasnulls = true;
PG_RETURN_BOOL(true);
}
attno = column->bv_attno;
attr = bdesc->bd_tupdesc->attrs[attno - 1];
/*
* If the recorded value is null, store the new value (which we know to be
* not null) as both minimum and maximum, and we're done.
*/
if (column->bv_allnulls)
{
column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
column->bv_allnulls = false;
PG_RETURN_BOOL(true);
}
/*
* Otherwise, need to compare the new value with the existing boundaries
* and update them accordingly. First check if it's less than the
* existing minimum.
*/
cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_LESS);
compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[0]);
if (DatumGetBool(compar))
{
if (!attr->attbyval)
pfree(DatumGetPointer(column->bv_values[0]));
column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
updated = true;
}
/*
* And now compare it to the existing maximum.
*/
cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_GREATER);
compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[1]);
if (DatumGetBool(compar))
{
if (!attr->attbyval)
pfree(DatumGetPointer(column->bv_values[1]));
column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
updated = true;
}
PG_RETURN_BOOL(updated);
}
/*
* Given an index tuple corresponding to a certain page range and a scan key,
* return whether the scan key is consistent with the index tuple's min/max
* values. Return true if so, false otherwise.
*/
Datum
minmaxConsistent(PG_FUNCTION_ARGS)
{
BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
ScanKey key = (ScanKey) PG_GETARG_POINTER(2);
Oid colloid = PG_GET_COLLATION();
AttrNumber attno;
Datum value;
Datum matches;
Assert(key->sk_attno == column->bv_attno);
/* handle IS NULL/IS NOT NULL tests */
if (key->sk_flags & SK_ISNULL)
{
if (key->sk_flags & SK_SEARCHNULL)
{
if (column->bv_allnulls || column->bv_hasnulls)
PG_RETURN_BOOL(true);
PG_RETURN_BOOL(false);
}
/*
* For IS NOT NULL, we can only skip ranges that are known to have
* only nulls.
*/
Assert(key->sk_flags & SK_SEARCHNOTNULL);
PG_RETURN_BOOL(!column->bv_allnulls);
}
/* if the range is all empty, it cannot possibly be consistent */
if (column->bv_allnulls)
PG_RETURN_BOOL(false);
attno = key->sk_attno;
value = key->sk_argument;
switch (key->sk_strategy)
{
case BTLessStrategyNumber:
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
PROCNUM_LESS),
colloid, column->bv_values[0], value);
break;
case BTLessEqualStrategyNumber:
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
PROCNUM_LESSEQUAL),
colloid, column->bv_values[0], value);
break;
case BTEqualStrategyNumber:
/*
* In the equality case (WHERE col = someval), we want to return
* the current page range if the minimum value in the range <=
* scan key, and the maximum value >= scan key.
*/
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
PROCNUM_LESSEQUAL),
colloid, column->bv_values[0], value);
if (!DatumGetBool(matches))
break;
/* max() >= scankey */
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
PROCNUM_GREATEREQUAL),
colloid, column->bv_values[1], value);
break;
case BTGreaterEqualStrategyNumber:
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
PROCNUM_GREATEREQUAL),
colloid, column->bv_values[1], value);
break;
case BTGreaterStrategyNumber:
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
PROCNUM_GREATER),
colloid, column->bv_values[1], value);
break;
default:
/* shouldn't happen */
elog(ERROR, "invalid strategy number %d", key->sk_strategy);
matches = 0;
break;
}
PG_RETURN_DATUM(matches);
}
/*
* Given two BrinValues, update the first of them as a union of the summary
* values contained in both. The second one is untouched.
*/
Datum
minmaxUnion(PG_FUNCTION_ARGS)
{
BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
Oid colloid = PG_GET_COLLATION();
AttrNumber attno;
Form_pg_attribute attr;
bool needsadj;
Assert(col_a->bv_attno == col_b->bv_attno);
/* If there are no values in B, there's nothing to do */
if (col_b->bv_allnulls)
PG_RETURN_VOID();
attno = col_a->bv_attno;
attr = bdesc->bd_tupdesc->attrs[attno - 1];
/* Adjust "hasnulls" */
if (col_b->bv_hasnulls && !col_a->bv_hasnulls)
col_a->bv_hasnulls = true;
/*
* Adjust "allnulls". If B has values but A doesn't, just copy the values
* from B into A, and we're done. (We cannot run the operators in this
* case, because values in A might contain garbage.)
*/
if (!col_b->bv_allnulls && col_a->bv_allnulls)
{
col_a->bv_allnulls = false;
col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
attr->attbyval, attr->attlen);
col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
attr->attbyval, attr->attlen);
PG_RETURN_VOID();
}
/* Adjust minimum, if B's min is less than A's min */
needsadj = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
PROCNUM_LESS),
colloid, col_b->bv_values[0], col_a->bv_values[0]);
if (needsadj)
{
if (!attr->attbyval)
pfree(DatumGetPointer(col_a->bv_values[0]));
col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
attr->attbyval, attr->attlen);
}
/* Adjust maximum, if B's max is greater than A's max */
needsadj = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
PROCNUM_GREATER),
colloid, col_b->bv_values[1], col_a->bv_values[1]);
if (needsadj)
{
if (!attr->attbyval)
pfree(DatumGetPointer(col_a->bv_values[1]));
col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
attr->attbyval, attr->attlen);
}
PG_RETURN_VOID();
}
/*
* Return the procedure corresponding to the given function support number.
*/
static FmgrInfo *
minmax_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
{
MinmaxOpaque *opaque;
uint16 basenum = procnum - PROCNUM_BASE;
opaque = (MinmaxOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
/*
* We cache these in the opaque struct, to avoid repetitive syscache
* lookups.
*/
if (!opaque->inited[basenum])
{
fmgr_info_copy(&opaque->operators[basenum],
index_getprocinfo(bdesc->bd_index, attno, procnum),
bdesc->bd_context);
opaque->inited[basenum] = true;
}
return &opaque->operators[basenum];
}
/*
* brin_pageops.c
* Page-handling routines for BRIN indexes
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/brin/brin_pageops.c
*/
#include "postgres.h"
#include "access/brin_pageops.h"
#include "access/brin_page.h"
#include "access/brin_revmap.h"
#include "access/brin_xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/rel.h"
static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
bool *was_extended);
static Size br_page_get_freespace(Page page);
/*
* Update tuple origtup (size origsz), located in offset oldoff of buffer
* oldbuf, to newtup (size newsz) as summary tuple for the page range starting
* at heapBlk. oldbuf must not be locked on entry, and is not locked at exit.
*
* If samepage is true, attempt to put the new tuple in the same page, but if
* there's no room, use some other one.
*
* If the update is successful, return true; the revmap is updated to point to
* the new tuple. If the update is not done for whatever reason, return false.
* Caller may retry the update if this happens.
*/
bool
brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
BrinRevmap *revmap, BlockNumber heapBlk,
Buffer oldbuf, OffsetNumber oldoff,
const BrinTuple *origtup, Size origsz,
const BrinTuple *newtup, Size newsz,
bool samepage)
{
Page oldpage;
ItemId oldlp;
BrinTuple *oldtup;
Size oldsz;
Buffer newbuf;
BrinSpecialSpace *special;
bool extended = false;
newsz = MAXALIGN(newsz);
/* make sure the revmap is long enough to contain the entry we need */
brinRevmapExtend(revmap, heapBlk);
if (!samepage)
{
/* need a page on which to put the item */
newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
/* XXX delay vacuuming FSM until locks are released? */
if (extended)
FreeSpaceMapVacuum(idxrel);
if (!BufferIsValid(newbuf))
return false;
/*
* Note: it's possible (though unlikely) that the returned newbuf is
* the same as oldbuf, if brin_getinsertbuffer determined that the old
* buffer does in fact have enough space.
*/
if (newbuf == oldbuf)
newbuf = InvalidBuffer;
}
else
{
LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
newbuf = InvalidBuffer;
}
oldpage = BufferGetPage(oldbuf);
oldlp = PageGetItemId(oldpage, oldoff);
/*
* Check that the old tuple wasn't updated concurrently: it might have
* moved someplace else entirely ...
*/
if (!ItemIdIsNormal(oldlp))
{
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
if (BufferIsValid(newbuf))
UnlockReleaseBuffer(newbuf);
return false;
}
oldsz = ItemIdGetLength(oldlp);
oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
/*
* ... or it might have been updated in place to different contents.
*/
if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
{
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
if (BufferIsValid(newbuf))
UnlockReleaseBuffer(newbuf);
return false;
}
special = (BrinSpecialSpace *) PageGetSpecialPointer(oldpage);
/*
* Great, the old tuple is intact. We can proceed with the update.
*
* If there's enough room in the old page for the new tuple, replace it.
*
* Note that there might now be enough space on the page even though the
* caller told us there isn't, if a concurrent update moved another tuple
* elsewhere or replaced a tuple with a smaller one.
*/
if (((special->flags & BRIN_EVACUATE_PAGE) == 0) &&
brin_can_do_samepage_update(oldbuf, origsz, newsz))
{
if (BufferIsValid(newbuf))
UnlockReleaseBuffer(newbuf);
START_CRIT_SECTION();
PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
if (PageAddItem(oldpage, (Item) newtup, newsz, oldoff, true,
false) == InvalidOffsetNumber)
elog(ERROR, "failed to add BRIN tuple");
MarkBufferDirty(oldbuf);
/* XLOG stuff */
if (RelationNeedsWAL(idxrel))
{
BlockNumber blk = BufferGetBlockNumber(oldbuf);
xl_brin_samepage_update xlrec;
XLogRecPtr recptr;
XLogRecData rdata[2];
uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE;
xlrec.node = idxrel->rd_node;
ItemPointerSetBlockNumber(&xlrec.tid, blk);
ItemPointerSetOffsetNumber(&xlrec.tid, oldoff);
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBrinSamepageUpdate;
rdata[0].buffer = InvalidBuffer;
rdata[0].next = &(rdata[1]);
rdata[1].data = (char *) newtup;
rdata[1].len = newsz;
rdata[1].buffer = oldbuf;
rdata[1].buffer_std = true;
rdata[1].next = NULL;
recptr = XLogInsert(RM_BRIN_ID, info, rdata);
PageSetLSN(oldpage, recptr);
}
END_CRIT_SECTION();
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
return true;
}
else if (newbuf == InvalidBuffer)
{
/*
* Not enough space, but caller said that there was. Tell them to
* start over.
*/
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
return false;
}
else
{
/*
* Not enough free space on the oldpage. Put the new tuple on the new
* page, and update the revmap.
*/
Page newpage = BufferGetPage(newbuf);
Buffer revmapbuf;
ItemPointerData newtid;
OffsetNumber newoff;
revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
START_CRIT_SECTION();
PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
newoff = PageAddItem(newpage, (Item) newtup, newsz,
InvalidOffsetNumber, false, false);
if (newoff == InvalidOffsetNumber)
elog(ERROR, "failed to add BRIN tuple to new page");
MarkBufferDirty(oldbuf);
MarkBufferDirty(newbuf);
ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff);
brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
MarkBufferDirty(revmapbuf);
/* XLOG stuff */
if (RelationNeedsWAL(idxrel))
{
xl_brin_update xlrec;
XLogRecPtr recptr;
XLogRecData rdata[4];
uint8 info;
info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
xlrec.new.node = idxrel->rd_node;
ItemPointerSet(&xlrec.new.tid, BufferGetBlockNumber(newbuf), newoff);
xlrec.new.heapBlk = heapBlk;
xlrec.new.tuplen = newsz;
xlrec.new.revmapBlk = BufferGetBlockNumber(revmapbuf);
xlrec.new.pagesPerRange = pagesPerRange;
ItemPointerSet(&xlrec.oldtid, BufferGetBlockNumber(oldbuf), oldoff);
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBrinUpdate;
rdata[0].buffer = InvalidBuffer;
rdata[0].next = &(rdata[1]);
rdata[1].data = (char *) newtup;
rdata[1].len = newsz;
rdata[1].buffer = extended ? InvalidBuffer : newbuf;
rdata[1].buffer_std = true;
rdata[1].next = &(rdata[2]);
rdata[2].data = (char *) NULL;
rdata[2].len = 0;
rdata[2].buffer = revmapbuf;
rdata[2].buffer_std = true;
rdata[2].next = &(rdata[3]);
rdata[3].data = (char *) NULL;
rdata[3].len = 0;
rdata[3].buffer = oldbuf;
rdata[3].buffer_std = true;
rdata[3].next = NULL;
recptr = XLogInsert(RM_BRIN_ID, info, rdata);
PageSetLSN(oldpage, recptr);
PageSetLSN(newpage, recptr);
PageSetLSN(BufferGetPage(revmapbuf), recptr);
}
END_CRIT_SECTION();
LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
UnlockReleaseBuffer(newbuf);
return true;
}
}
/*
* Return whether brin_doupdate can do a samepage update.
*/
bool
brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
{
return
((newsz <= origsz) ||
PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
}
/*
* Insert an index tuple into the index relation. The revmap is updated to
* mark the range containing the given page as pointing to the inserted entry.
* A WAL record is written.
*
* The buffer, if valid, is first checked for free space to insert the new
* entry; if there isn't enough, a new buffer is obtained and pinned. No
* buffer lock must be held on entry, no buffer lock is held on exit.
*
* Return value is the offset number where the tuple was inserted.
*/
OffsetNumber
brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
BrinTuple *tup, Size itemsz)
{
Page page;
BlockNumber blk;
OffsetNumber off;
Buffer revmapbuf;
ItemPointerData tid;
bool extended = false;
itemsz = MAXALIGN(itemsz);
/* Make sure the revmap is long enough to contain the entry we need */
brinRevmapExtend(revmap, heapBlk);
/*
* Obtain a locked buffer to insert the new tuple. Note
* brin_getinsertbuffer ensures there's enough space in the returned
* buffer.
*/
if (BufferIsValid(*buffer))
{
/*
* It's possible that another backend (or ourselves!) extended the
* revmap over the page we held a pin on, so we cannot assume that
* it's still a regular page.
*/
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
{
UnlockReleaseBuffer(*buffer);
*buffer = InvalidBuffer;
}
}
if (!BufferIsValid(*buffer))
{
*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
Assert(BufferIsValid(*buffer));
Assert(br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz);
}
/* Now obtain lock on revmap buffer */
revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
page = BufferGetPage(*buffer);
blk = BufferGetBlockNumber(*buffer);
START_CRIT_SECTION();
off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
false, false);
if (off == InvalidOffsetNumber)
elog(ERROR, "could not insert new index tuple to page");
MarkBufferDirty(*buffer);
BRIN_elog(DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
blk, off, heapBlk);
ItemPointerSet(&tid, blk, off);
brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
MarkBufferDirty(revmapbuf);
/* XLOG stuff */
if (RelationNeedsWAL(idxrel))
{
xl_brin_insert xlrec;
XLogRecPtr recptr;
XLogRecData rdata[3];
uint8 info;
info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
xlrec.node = idxrel->rd_node;
xlrec.heapBlk = heapBlk;
xlrec.pagesPerRange = pagesPerRange;
xlrec.revmapBlk = BufferGetBlockNumber(revmapbuf);
xlrec.tuplen = itemsz;
ItemPointerSet(&xlrec.tid, blk, off);
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBrinInsert;
rdata[0].buffer = InvalidBuffer;
rdata[0].buffer_std = false;
rdata[0].next = &(rdata[1]);
rdata[1].data = (char *) tup;
rdata[1].len = itemsz;
rdata[1].buffer = extended ? InvalidBuffer : *buffer;
rdata[1].buffer_std = true;
rdata[1].next = &(rdata[2]);
rdata[2].data = (char *) NULL;
rdata[2].len = 0;
rdata[2].buffer = revmapbuf;
rdata[2].buffer_std = false;
rdata[2].next = NULL;
recptr = XLogInsert(RM_BRIN_ID, info, rdata);
PageSetLSN(page, recptr);
PageSetLSN(BufferGetPage(revmapbuf), recptr);
}
END_CRIT_SECTION();
/* Tuple is firmly on buffer; we can release our locks */
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
if (extended)
FreeSpaceMapVacuum(idxrel);
return off;
}
/*
* Initialize a page with the given type.
*
* Caller is responsible for marking it dirty, as appropriate.
*/
void
brin_page_init(Page page, uint16 type)
{
BrinSpecialSpace *special;
PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
special->type = type;
}
/*
* Initialize a new BRIN index' metapage.
*/
void
brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
{
BrinMetaPageData *metadata;
brin_page_init(page, BRIN_PAGETYPE_META);
metadata = (BrinMetaPageData *) PageGetContents(page);
metadata->brinMagic = BRIN_META_MAGIC;
metadata->brinVersion = version;
metadata->pagesPerRange = pagesPerRange;
/*
* Note we cheat here a little. 0 is not a valid revmap block number
* (because it's the metapage buffer), but doing this enables the first
* revmap page to be created when the index is.
*/
metadata->lastRevmapPage = 0;
}
/*
* Initiate page evacuation protocol.
*
* The page must be locked in exclusive mode by the caller.
*
* If the page is not yet initialized or empty, return false without doing
* anything; it can be used for revmap without any further changes. If it
* contains tuples, mark it for evacuation and return true.
*/
bool
brin_start_evacuating_page(Relation idxRel, Buffer buf)
{
OffsetNumber off;
OffsetNumber maxoff;
BrinSpecialSpace *special;
Page page;
page = BufferGetPage(buf);
if (PageIsNew(page))
return false;
special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
for (off = FirstOffsetNumber; off <= maxoff; off++)
{
ItemId lp;
lp = PageGetItemId(page, off);
if (ItemIdIsUsed(lp))
{
/* prevent other backends from adding more stuff to this page */
special->flags |= BRIN_EVACUATE_PAGE;
MarkBufferDirtyHint(buf, true);
return true;
}
}
return false;
}
/*
* Move all tuples out of a page.
*
* The caller must hold lock on the page. The lock and pin are released.
*/
void
brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
BrinRevmap *revmap, Buffer buf)
{
OffsetNumber off;
OffsetNumber maxoff;
Page page;
page = BufferGetPage(buf);
Assert(((BrinSpecialSpace *)
PageGetSpecialPointer(page))->flags & BRIN_EVACUATE_PAGE);
maxoff = PageGetMaxOffsetNumber(page);
for (off = FirstOffsetNumber; off <= maxoff; off++)
{
BrinTuple *tup;
Size sz;
ItemId lp;
CHECK_FOR_INTERRUPTS();
lp = PageGetItemId(page, off);
if (ItemIdIsUsed(lp))
{
sz = ItemIdGetLength(lp);
tup = (BrinTuple *) PageGetItem(page, lp);
tup = brin_copy_tuple(tup, sz);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
buf, off, tup, sz, tup, sz, false))
off--; /* retry */
LockBuffer(buf, BUFFER_LOCK_SHARE);
/* It's possible that someone extended the revmap over this page */
if (!BRIN_IS_REGULAR_PAGE(page))
break;
}
}
UnlockReleaseBuffer(buf);
}
/*
* Return a pinned and exclusively locked buffer which can be used to insert an
* index item of size itemsz. If oldbuf is a valid buffer, it is also locked
* (in a order determined to avoid deadlocks.)
*
* If there's no existing page with enough free space to accomodate the new
* item, the relation is extended. If this happens, *extended is set to true.
*
* If we find that the old page is no longer a regular index page (because
* of a revmap extension), the old buffer is unlocked and we return
* InvalidBuffer.
*/
static Buffer
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
bool *was_extended)
{
BlockNumber oldblk;
BlockNumber newblk;
Page page;
int freespace;
if (BufferIsValid(oldbuf))
oldblk = BufferGetBlockNumber(oldbuf);
else
oldblk = InvalidBlockNumber;
/*
* Loop until we find a page with sufficient free space. By the time we
* return to caller out of this loop, both buffers are valid and locked;
* if we have to restart here, neither buffer is locked and buf is not a
* pinned buffer.
*/
newblk = RelationGetTargetBlock(irel);
if (newblk == InvalidBlockNumber)
newblk = GetPageWithFreeSpace(irel, itemsz);
for (;;)
{
Buffer buf;
bool extensionLockHeld = false;
bool extended = false;
CHECK_FOR_INTERRUPTS();
if (newblk == InvalidBlockNumber)
{
/*
* There's not enough free space in any existing index page,
* according to the FSM: extend the relation to obtain a shiny new
* page.
*/
if (!RELATION_IS_LOCAL(irel))
{
LockRelationForExtension(irel, ExclusiveLock);
extensionLockHeld = true;
}
buf = ReadBuffer(irel, P_NEW);
newblk = BufferGetBlockNumber(buf);
*was_extended = extended = true;
BRIN_elog(DEBUG2, "brin_getinsertbuffer: extending to page %u",
BufferGetBlockNumber(buf));
}
else if (newblk == oldblk)
{
/*
* There's an odd corner-case here where the FSM is out-of-date,
* and gave us the old page.
*/
buf = oldbuf;
}
else
{
buf = ReadBuffer(irel, newblk);
}
/*
* We lock the old buffer first, if it's earlier than the new one; but
* before we do, we need to check that it hasn't been turned into a
* revmap page concurrently; if we detect that it happened, give up
* and tell caller to start over.
*/
if (BufferIsValid(oldbuf) && oldblk < newblk)
{
LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
{
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buf);
return InvalidBuffer;
}
}
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
if (extensionLockHeld)
UnlockRelationForExtension(irel, ExclusiveLock);
page = BufferGetPage(buf);
if (extended)
brin_page_init(page, BRIN_PAGETYPE_REGULAR);
/*
* We have a new buffer to insert into. Check that the new page has
* enough free space, and return it if it does; otherwise start over.
* Note that we allow for the FSM to be out of date here, and in that
* case we update it and move on.
*
* (br_page_get_freespace also checks that the FSM didn't hand us a
* page that has since been repurposed for the revmap.)
*/
freespace = br_page_get_freespace(page);
if (freespace >= itemsz)
{
RelationSetTargetBlock(irel, BufferGetBlockNumber(buf));
/*
* Since the target block specification can get lost on cache
* invalidations, make sure we update the more permanent FSM with
* data about it before going away.
*/
if (extended)
RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
freespace);
/*
* Lock the old buffer if not locked already. Note that in this
* case we know for sure it's a regular page: it's later than the
* new page we just got, which is not a revmap page, and revmap
* pages are always consecutive.
*/
if (BufferIsValid(oldbuf) && oldblk > newblk)
{
LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
}
return buf;
}
/* This page is no good. */
/*
* If an entirely new page does not contain enough free space for the
* new item, then surely that item is oversized. Complain loudly; but
* first make sure we record the page as free, for next time.
*/
if (extended)
{
RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
freespace);
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
(unsigned long) itemsz,
(unsigned long) freespace,
RelationGetRelationName(irel))));
return InvalidBuffer; /* keep compiler quiet */
}
if (newblk != oldblk)
UnlockReleaseBuffer(buf);
if (BufferIsValid(oldbuf) && oldblk <= newblk)
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
}
}
/*
* Return the amount of free space on a regular BRIN index page.
*
* If the page is not a regular page, or has been marked with the
* BRIN_EVACUATE_PAGE flag, returns 0.
*/
static Size
br_page_get_freespace(Page page)
{
BrinSpecialSpace *special;
special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
if (!BRIN_IS_REGULAR_PAGE(page) ||
(special->flags & BRIN_EVACUATE_PAGE) != 0)
return 0;
else
return PageGetFreeSpace(page);
}
/*
* brin_revmap.c
* Range map for BRIN indexes
*
* The range map (revmap) is a translation structure for BRIN indexes: for each
* page range there is one summary tuple, and its location is tracked by the
* revmap. Whenever a new tuple is inserted into a table that violates the
* previously recorded summary values, a new tuple is inserted into the index
* and the revmap is updated to point to it.
*
* The revmap is stored in the first pages of the index, immediately following
* the metapage. When the revmap needs to be expanded, all tuples on the
* regular BRIN page at that block (if any) are moved out of the way.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/brin/brin_revmap.c
*/
#include "postgres.h"
#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_revmap.h"
#include "access/brin_tuple.h"
#include "access/brin_xlog.h"
#include "access/rmgr.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "utils/rel.h"
/*
* In revmap pages, each item stores an ItemPointerData. These defines let one
* find the logical revmap page number and index number of the revmap item for
* the given heap block number.
*/
#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \
((heapBlk / pagesPerRange) / REVMAP_PAGE_MAXITEMS)
#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
((heapBlk / pagesPerRange) % REVMAP_PAGE_MAXITEMS)
struct BrinRevmap
{
Relation rm_irel;
BlockNumber rm_pagesPerRange;
BlockNumber rm_lastRevmapPage; /* cached from the metapage */
Buffer rm_metaBuf;
Buffer rm_currBuf;
};
/* typedef appears in brin_revmap.h */
static BlockNumber revmap_get_blkno(BrinRevmap *revmap,
BlockNumber heapBlk);
static Buffer revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk);
static BlockNumber revmap_extend_and_get_blkno(BrinRevmap *revmap,
BlockNumber heapBlk);
static void revmap_physical_extend(BrinRevmap *revmap);
/*
* Initialize an access object for a range map. This must be freed by
* brinRevmapTerminate when caller is done with it.
*/
BrinRevmap *
brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
{
BrinRevmap *revmap;
Buffer meta;
BrinMetaPageData *metadata;
meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO);
LockBuffer(meta, BUFFER_LOCK_SHARE);
metadata = (BrinMetaPageData *) PageGetContents(BufferGetPage(meta));
revmap = palloc(sizeof(BrinRevmap));
revmap->rm_irel = idxrel;
revmap->rm_pagesPerRange = metadata->pagesPerRange;
revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
revmap->rm_metaBuf = meta;
revmap->rm_currBuf = InvalidBuffer;
*pagesPerRange = metadata->pagesPerRange;
LockBuffer(meta, BUFFER_LOCK_UNLOCK);
return revmap;
}
/*
* Release resources associated with a revmap access object.
*/
void
brinRevmapTerminate(BrinRevmap *revmap)
{
ReleaseBuffer(revmap->rm_metaBuf);
if (revmap->rm_currBuf != InvalidBuffer)
ReleaseBuffer(revmap->rm_currBuf);
pfree(revmap);
}
/*
* Extend the revmap to cover the given heap block number.
*/
void
brinRevmapExtend(BrinRevmap *revmap, BlockNumber heapBlk)
{
BlockNumber mapBlk;
mapBlk = revmap_extend_and_get_blkno(revmap, heapBlk);
/* Ensure the buffer we got is in the expected range */
Assert(mapBlk != InvalidBlockNumber &&
mapBlk != BRIN_METAPAGE_BLKNO &&
mapBlk <= revmap->rm_lastRevmapPage);
}
/*
* Prepare to insert an entry into the revmap; the revmap buffer in which the
* entry is to reside is locked and returned. Most callers should call
* brinRevmapExtend beforehand, as this routine does not extend the revmap if
* it's not long enough.
*
* The returned buffer is also recorded in the revmap struct; finishing that
* releases the buffer, therefore the caller needn't do it explicitely.
*/
Buffer
brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk)
{
Buffer rmBuf;
rmBuf = revmap_get_buffer(revmap, heapBlk);
LockBuffer(rmBuf, BUFFER_LOCK_EXCLUSIVE);
return rmBuf;
}
/*
* In the given revmap buffer (locked appropriately by caller), which is used
* in a BRIN index of pagesPerRange pages per range, set the element
* corresponding to heap block number heapBlk to the given TID.
*
* Once the operation is complete, the caller must update the LSN on the
* returned buffer.
*
* This is used both in regular operation and during WAL replay.
*/
void
brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
BlockNumber heapBlk, ItemPointerData tid)
{
RevmapContents *contents;
ItemPointerData *iptr;
Page page;
/* The correct page should already be pinned and locked */
page = BufferGetPage(buf);
contents = (RevmapContents *) PageGetContents(page);
iptr = (ItemPointerData *) contents->rm_tids;
iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);
ItemPointerSet(iptr,
ItemPointerGetBlockNumber(&tid),
ItemPointerGetOffsetNumber(&tid));
}
/*
* Fetch the BrinTuple for a given heap block.
*
* The buffer containing the tuple is locked, and returned in *buf. As an
* optimization, the caller can pass a pinned buffer *buf on entry, which will
* avoid a pin-unpin cycle when the next tuple is on the same page as a
* previous one.
*
* If no tuple is found for the given heap range, returns NULL. In that case,
* *buf might still be updated, but it's not locked.
*
* The output tuple offset within the buffer is returned in *off, and its size
* is returned in *size.
*/
BrinTuple *
brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
Buffer *buf, OffsetNumber *off, Size *size, int mode)
{
Relation idxRel = revmap->rm_irel;
BlockNumber mapBlk;
RevmapContents *contents;
ItemPointerData *iptr;
BlockNumber blk;
Page page;
ItemId lp;
BrinTuple *tup;
ItemPointerData previptr;
/* normalize the heap block number to be the first page in the range */
heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;
/* Compute the revmap page number we need */
mapBlk = revmap_get_blkno(revmap, heapBlk);
if (mapBlk == InvalidBlockNumber)
{
*off = InvalidOffsetNumber;
return NULL;
}
ItemPointerSetInvalid(&previptr);
for (;;)
{
CHECK_FOR_INTERRUPTS();
if (revmap->rm_currBuf == InvalidBuffer ||
BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk)
{
if (revmap->rm_currBuf != InvalidBuffer)
ReleaseBuffer(revmap->rm_currBuf);
Assert(mapBlk != InvalidBlockNumber);
revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
}
LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE);
contents = (RevmapContents *)
PageGetContents(BufferGetPage(revmap->rm_currBuf));
iptr = contents->rm_tids;
iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);
if (!ItemPointerIsValid(iptr))
{
LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
return NULL;
}
/*
* Check the TID we got in a previous iteration, if any, and save the
* current TID we got from the revmap; if we loop, we can sanity-check
* that the next one we get is different. Otherwise we might be stuck
* looping forever if the revmap is somehow badly broken.
*/
if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("corrupted BRIN index: inconsistent range map")));
previptr = *iptr;
blk = ItemPointerGetBlockNumber(iptr);
*off = ItemPointerGetOffsetNumber(iptr);
LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
/* Ok, got a pointer to where the BrinTuple should be. Fetch it. */
if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk)
{
if (BufferIsValid(*buf))
ReleaseBuffer(*buf);
*buf = ReadBuffer(idxRel, blk);
}
LockBuffer(*buf, mode);
page = BufferGetPage(*buf);
/* If we land on a revmap page, start over */
if (BRIN_IS_REGULAR_PAGE(page))
{
lp = PageGetItemId(page, *off);
if (ItemIdIsUsed(lp))
{
tup = (BrinTuple *) PageGetItem(page, lp);
if (tup->bt_blkno == heapBlk)
{
if (size)
*size = ItemIdGetLength(lp);
/* found it! */
return tup;
}
}
}
/*
* No luck. Assume that the revmap was updated concurrently.
*/
LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
}
/* not reached, but keep compiler quiet */
return NULL;
}
/*
* Given a heap block number, find the corresponding physical revmap block
* number and return it. If the revmap page hasn't been allocated yet, return
* InvalidBlockNumber.
*/
static BlockNumber
revmap_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
{
BlockNumber targetblk;
/* obtain revmap block number, skip 1 for metapage block */
targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
/* Normal case: the revmap page is already allocated */
if (targetblk <= revmap->rm_lastRevmapPage)
return targetblk;
return InvalidBlockNumber;
}
/*
* Obtain and return a buffer containing the revmap page for the given heap
* page. The revmap must have been previously extended to cover that page.
* The returned buffer is also recorded in the revmap struct; finishing that
* releases the buffer, therefore the caller needn't do it explicitely.
*/
static Buffer
revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk)
{
BlockNumber mapBlk;
/* Translate the heap block number to physical index location. */
mapBlk = revmap_get_blkno(revmap, heapBlk);
if (mapBlk == InvalidBlockNumber)
elog(ERROR, "revmap does not cover heap block %u", heapBlk);
/* Ensure the buffer we got is in the expected range */
Assert(mapBlk != BRIN_METAPAGE_BLKNO &&
mapBlk <= revmap->rm_lastRevmapPage);
BRIN_elog(DEBUG2, "getting revmap page for logical page %lu (physical %u) for heap %u",
HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk),
mapBlk, heapBlk);
/*
* Obtain the buffer from which we need to read. If we already have the
* correct buffer in our access struct, use that; otherwise, release that,
* (if valid) and read the one we need.
*/
if (revmap->rm_currBuf == InvalidBuffer ||
mapBlk != BufferGetBlockNumber(revmap->rm_currBuf))
{
if (revmap->rm_currBuf != InvalidBuffer)
ReleaseBuffer(revmap->rm_currBuf);
revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
}
return revmap->rm_currBuf;
}
/*
* Given a heap block number, find the corresponding physical revmap block
* number and return it. If the revmap page hasn't been allocated yet, extend
* the revmap until it is.
*/
static BlockNumber
revmap_extend_and_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
{
BlockNumber targetblk;
/* obtain revmap block number, skip 1 for metapage block */
targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
/* Extend the revmap, if necessary */
while (targetblk > revmap->rm_lastRevmapPage)
{
CHECK_FOR_INTERRUPTS();
revmap_physical_extend(revmap);
}
return targetblk;
}
/*
* Try to extend the revmap by one page. This might not happen for a number of
* reasons; caller is expected to retry until the expected outcome is obtained.
*/
static void
revmap_physical_extend(BrinRevmap *revmap)
{
Buffer buf;
Page page;
Page metapage;
BrinMetaPageData *metadata;
BlockNumber mapBlk;
BlockNumber nblocks;
Relation irel = revmap->rm_irel;
bool needLock = !RELATION_IS_LOCAL(irel);
/*
* Lock the metapage. This locks out concurrent extensions of the revmap,
* but note that we still need to grab the relation extension lock because
* another backend can extend the index with regular BRIN pages.
*/
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE);
metapage = BufferGetPage(revmap->rm_metaBuf);
metadata = (BrinMetaPageData *) PageGetContents(metapage);
/*
* Check that our cached lastRevmapPage value was up-to-date; if it
* wasn't, update the cached copy and have caller start over.
*/
if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage)
{
revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
return;
}
mapBlk = metadata->lastRevmapPage + 1;
nblocks = RelationGetNumberOfBlocks(irel);
if (mapBlk < nblocks)
{
buf = ReadBuffer(irel, mapBlk);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
}
else
{
if (needLock)
LockRelationForExtension(irel, ExclusiveLock);
buf = ReadBuffer(irel, P_NEW);
if (BufferGetBlockNumber(buf) != mapBlk)
{
/*
* Very rare corner case: somebody extended the relation
* concurrently after we read its length. If this happens, give
* up and have caller start over. We will have to evacuate that
* page from under whoever is using it.
*/
if (needLock)
UnlockRelationForExtension(irel, ExclusiveLock);
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
return;
}
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
if (needLock)
UnlockRelationForExtension(irel, ExclusiveLock);
}
/* Check that it's a regular block (or an empty page) */
if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u",
BRIN_PAGE_TYPE(page),
RelationGetRelationName(irel),
BufferGetBlockNumber(buf))));
/* If the page is in use, evacuate it and restart */
if (brin_start_evacuating_page(irel, buf))
{
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf);
/* have caller start over */
return;
}
/*
* Ok, we have now locked the metapage and the target block. Re-initialize
* it as a revmap page.
*/
START_CRIT_SECTION();
/* the rm_tids array is initialized to all invalid by PageInit */
brin_page_init(page, BRIN_PAGETYPE_REVMAP);
MarkBufferDirty(buf);
metadata->lastRevmapPage = mapBlk;
MarkBufferDirty(revmap->rm_metaBuf);
if (RelationNeedsWAL(revmap->rm_irel))
{
xl_brin_revmap_extend xlrec;
XLogRecPtr recptr;
XLogRecData rdata[2];
xlrec.node = revmap->rm_irel->rd_node;
xlrec.targetBlk = mapBlk;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBrinRevmapExtend;
rdata[0].buffer = InvalidBuffer;
rdata[0].buffer_std = false;
rdata[0].next = &(rdata[1]);
rdata[1].data = (char *) NULL;
rdata[1].len = 0;
rdata[1].buffer = revmap->rm_metaBuf;
rdata[1].buffer_std = false;
rdata[1].next = NULL;
recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND, rdata);
PageSetLSN(metapage, recptr);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
UnlockReleaseBuffer(buf);
}
/*
* brin_tuples.c
* Method implementations for tuples in BRIN indexes.
*
* Intended usage is that code outside this file only deals with
* BrinMemTuples, and convert to and from the on-disk representation through
* functions in this file.
*
* NOTES
*
* A BRIN tuple is similar to a heap tuple, with a few key differences. The
* first interesting difference is that the tuple header is much simpler, only
* containing its total length and a small area for flags. Also, the stored
* data does not match the relation tuple descriptor exactly: for each
* attribute in the descriptor, the index tuple carries an arbitrary number
* of values, depending on the opclass.
*
* Also, for each column of the index relation there are two null bits: one
* (hasnulls) stores whether any tuple within the page range has that column
* set to null; the other one (allnulls) stores whether the column values are
* all null. If allnulls is true, then the tuple data area does not contain
* values for that column at all; whereas it does if the hasnulls is set.
* Note the size of the null bitmask may not be the same as that of the
* datum array.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/brin/brin_tuple.c
*/
#include "postgres.h"
#include "access/htup_details.h"
#include "access/brin_tuple.h"
#include "access/tupdesc.h"
#include "access/tupmacs.h"
#include "utils/datum.h"
#include "utils/memutils.h"
static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
char *tp, bits8 *nullbits, bool nulls,
Datum *values, bool *allnulls, bool *hasnulls);
/*
* Return a tuple descriptor used for on-disk storage of BRIN tuples.
*/
static TupleDesc
brtuple_disk_tupdesc(BrinDesc *brdesc)
{
/* We cache these in the BrinDesc */
if (brdesc->bd_disktdesc == NULL)
{
int i;
int j;
AttrNumber attno = 1;
TupleDesc tupdesc;
MemoryContext oldcxt;
/* make sure it's in the bdesc's context */
oldcxt = MemoryContextSwitchTo(brdesc->bd_context);
tupdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored, false);
for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
{
for (j = 0; j < brdesc->bd_info[i]->oi_nstored; j++)
TupleDescInitEntry(tupdesc, attno++, NULL,
brdesc->bd_info[i]->oi_typids[j],
-1, 0);
}
MemoryContextSwitchTo(oldcxt);
brdesc->bd_disktdesc = tupdesc;
}
return brdesc->bd_disktdesc;
}
/*
* Generate a new on-disk tuple to be inserted in a BRIN index.
*
* See brin_form_placeholder_tuple if you touch this.
*/
BrinTuple *
brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
Size *size)
{
Datum *values;
bool *nulls;
bool anynulls = false;
BrinTuple *rettuple;
int keyno;
int idxattno;
uint16 phony_infomask;
bits8 *phony_nullbitmap;
Size len,
hoff,
data_len;
Assert(brdesc->bd_totalstored > 0);
values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
nulls = palloc0(sizeof(bool) * brdesc->bd_totalstored);
phony_nullbitmap = palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored));
/*
* Set up the values/nulls arrays for heap_fill_tuple
*/
idxattno = 0;
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
{
int datumno;
/*
* "allnulls" is set when there's no nonnull value in any row in the
* column; when this happens, there is no data to store. Thus set the
* nullable bits for all data elements of this column and we're done.
*/
if (tuple->bt_columns[keyno].bv_allnulls)
{
for (datumno = 0;
datumno < brdesc->bd_info[keyno]->oi_nstored;
datumno++)
nulls[idxattno++] = true;
anynulls = true;
continue;
}
/*
* The "hasnulls" bit is set when there are some null values in the
* data. We still need to store a real value, but the presence of
* this means we need a null bitmap.
*/
if (tuple->bt_columns[keyno].bv_hasnulls)
anynulls = true;
for (datumno = 0;
datumno < brdesc->bd_info[keyno]->oi_nstored;
datumno++)
values[idxattno++] = tuple->bt_columns[keyno].bv_values[datumno];
}
/* compute total space needed */
len = SizeOfBrinTuple;
if (anynulls)
{
/*
* We need a double-length bitmap on an on-disk BRIN index tuple; the
* first half stores the "allnulls" bits, the second stores
* "hasnulls".
*/
len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
}
len = hoff = MAXALIGN(len);
data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc),
values, nulls);
len += data_len;
rettuple = palloc0(len);
rettuple->bt_blkno = blkno;
rettuple->bt_info = hoff;
Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff);
/*
* The infomask and null bitmap as computed by heap_fill_tuple are useless
* to us. However, that function will not accept a null infomask; and we
* need to pass a valid null bitmap so that it will correctly skip
* outputting null attributes in the data area.
*/
heap_fill_tuple(brtuple_disk_tupdesc(brdesc),
values,
nulls,
(char *) rettuple + hoff,
data_len,
&phony_infomask,
phony_nullbitmap);
/* done with these */
pfree(values);
pfree(nulls);
pfree(phony_nullbitmap);
/*
* Now fill in the real null bitmasks. allnulls first.
*/
if (anynulls)
{
bits8 *bitP;
int bitmask;
rettuple->bt_info |= BRIN_NULLS_MASK;
/*
* Note that we reverse the sense of null bits in this module: we
* store a 1 for a null attribute rather than a 0. So we must reverse
* the sense of the att_isnull test in br_deconstruct_tuple as well.
*/
bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
bitmask = HIGHBIT;
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
{
if (bitmask != HIGHBIT)
bitmask <<= 1;
else
{
bitP += 1;
*bitP = 0x0;
bitmask = 1;
}
if (!tuple->bt_columns[keyno].bv_allnulls)
continue;
*bitP |= bitmask;
}
/* hasnulls bits follow */
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
{
if (bitmask != HIGHBIT)
bitmask <<= 1;
else
{
bitP += 1;
*bitP = 0x0;
bitmask = 1;
}
if (!tuple->bt_columns[keyno].bv_hasnulls)
continue;
*bitP |= bitmask;
}
bitP = ((bits8 *) (rettuple + SizeOfBrinTuple)) - 1;
}
if (tuple->bt_placeholder)
rettuple->bt_info |= BRIN_PLACEHOLDER_MASK;
*size = len;
return rettuple;
}
/*
* Generate a new on-disk tuple with no data values, marked as placeholder.
*
* This is a cut-down version of brin_form_tuple.
*/
BrinTuple *
brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
{
Size len;
Size hoff;
BrinTuple *rettuple;
int keyno;
bits8 *bitP;
int bitmask;
/* compute total space needed: always add nulls */
len = SizeOfBrinTuple;
len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
len = hoff = MAXALIGN(len);
rettuple = palloc0(len);
rettuple->bt_blkno = blkno;
rettuple->bt_info = hoff;
rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK;
bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
bitmask = HIGHBIT;
/* set allnulls true for all attributes */
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
{
if (bitmask != HIGHBIT)
bitmask <<= 1;
else
{
bitP += 1;
*bitP = 0x0;
bitmask = 1;
}
*bitP |= bitmask;
}
/* no need to set hasnulls */
*size = len;
return rettuple;
}
/*
* Free a tuple created by brin_form_tuple
*/
void
brin_free_tuple(BrinTuple *tuple)
{
pfree(tuple);
}
/*
* Create an palloc'd copy of a BrinTuple.
*/
BrinTuple *
brin_copy_tuple(BrinTuple *tuple, Size len)
{
BrinTuple *newtup;
newtup = palloc(len);
memcpy(newtup, tuple, len);
return newtup;
}
/*
* Return whether two BrinTuples are bitwise identical.
*/
bool
brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen)
{
if (alen != blen)
return false;
if (memcmp(a, b, alen) != 0)
return false;
return true;
}
/*
* Create a new BrinMemTuple from scratch, and initialize it to an empty
* state.
*
* Note: we don't provide any means to free a deformed tuple, so make sure to
* use a temporary memory context.
*/
BrinMemTuple *
brin_new_memtuple(BrinDesc *brdesc)
{
BrinMemTuple *dtup;
char *currdatum;
long basesize;
int i;
basesize = MAXALIGN(sizeof(BrinMemTuple) +
sizeof(BrinValues) * brdesc->bd_tupdesc->natts);
dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored);
currdatum = (char *) dtup + basesize;
for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
{
dtup->bt_columns[i].bv_attno = i + 1;
dtup->bt_columns[i].bv_allnulls = true;
dtup->bt_columns[i].bv_hasnulls = false;
dtup->bt_columns[i].bv_values = (Datum *) currdatum;
currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored;
}
dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext,
"brin dtuple",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
return dtup;
}
/*
* Reset a BrinMemTuple to initial state
*/
void
brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
{
int i;
MemoryContextReset(dtuple->bt_context);
for (i = 0; i < brdesc->bd_tupdesc->natts; i++)
{
dtuple->bt_columns[i].bv_allnulls = true;
dtuple->bt_columns[i].bv_hasnulls = false;
}
}
/*
* Convert a BrinTuple back to a BrinMemTuple. This is the reverse of
* brin_form_tuple.
*
* Note we don't need the "on disk tupdesc" here; we rely on our own routine to
* deconstruct the tuple from the on-disk format.
*/
BrinMemTuple *
brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple)
{
BrinMemTuple *dtup;
Datum *values;
bool *allnulls;
bool *hasnulls;
char *tp;
bits8 *nullbits;
int keyno;
int valueno;
MemoryContext oldcxt;
dtup = brin_new_memtuple(brdesc);
if (BrinTupleIsPlaceholder(tuple))
dtup->bt_placeholder = true;
dtup->bt_blkno = tuple->bt_blkno;
values = palloc(sizeof(Datum) * brdesc->bd_totalstored);
allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
tp = (char *) tuple + BrinTupleDataOffset(tuple);
if (BrinTupleHasNulls(tuple))
nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple);
else
nullbits = NULL;
brin_deconstruct_tuple(brdesc,
tp, nullbits, BrinTupleHasNulls(tuple),
values, allnulls, hasnulls);
/*
* Iterate to assign each of the values to the corresponding item in the
* values array of each column. The copies occur in the tuple's context.
*/
oldcxt = MemoryContextSwitchTo(dtup->bt_context);
for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
{
int i;
if (allnulls[keyno])
{
valueno += brdesc->bd_info[keyno]->oi_nstored;
continue;
}
/*
* We would like to skip datumCopy'ing the values datum in some cases,
* caller permitting ...
*/
for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++)
dtup->bt_columns[keyno].bv_values[i] =
datumCopy(values[valueno++],
brdesc->bd_tupdesc->attrs[keyno]->attbyval,
brdesc->bd_tupdesc->attrs[keyno]->attlen);
dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno];
dtup->bt_columns[keyno].bv_allnulls = false;
}
MemoryContextSwitchTo(oldcxt);
pfree(values);
pfree(allnulls);
pfree(hasnulls);
return dtup;
}
/*
* brin_deconstruct_tuple
* Guts of attribute extraction from an on-disk BRIN tuple.
*
* Its arguments are:
* brdesc BRIN descriptor for the stored tuple
* tp pointer to the tuple data area
* nullbits pointer to the tuple nulls bitmask
* nulls "has nulls" bit in tuple infomask
* values output values, array of size brdesc->bd_totalstored
* allnulls output "allnulls", size brdesc->bd_tupdesc->natts
* hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts
*
* Output arrays must have been allocated by caller.
*/
static inline void
brin_deconstruct_tuple(BrinDesc *brdesc,
char *tp, bits8 *nullbits, bool nulls,
Datum *values, bool *allnulls, bool *hasnulls)
{
int attnum;
int stored;
TupleDesc diskdsc;
long off;
/*
* First iterate to natts to obtain both null flags for each attribute.
* Note that we reverse the sense of the att_isnull test, because we store
* 1 for a null value (rather than a 1 for a not null value as is the
* att_isnull convention used elsewhere.) See brin_form_tuple.
*/
for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
{
/*
* the "all nulls" bit means that all values in the page range for
* this column are nulls. Therefore there are no values in the tuple
* data area.
*/
allnulls[attnum] = nulls && !att_isnull(attnum, nullbits);
/*
* the "has nulls" bit means that some tuples have nulls, but others
* have not-null values. Therefore we know the tuple contains data
* for this column.
*
* The hasnulls bits follow the allnulls bits in the same bitmask.
*/
hasnulls[attnum] =
nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits);
}
/*
* Iterate to obtain each attribute's stored values. Note that since we
* may reuse attribute entries for more than one column, we cannot cache
* offsets here.
*/
diskdsc = brtuple_disk_tupdesc(brdesc);
stored = 0;
off = 0;
for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
{
int datumno;
if (allnulls[attnum])
{
stored += brdesc->bd_info[attnum]->oi_nstored;
continue;
}
for (datumno = 0;
datumno < brdesc->bd_info[attnum]->oi_nstored;
datumno++)
{
Form_pg_attribute thisatt = diskdsc->attrs[stored];
if (thisatt->attlen == -1)
{
off = att_align_pointer(off, thisatt->attalign, -1,
tp + off);
}
else
{
/* not varlena, so safe to use att_align_nominal */
off = att_align_nominal(off, thisatt->attalign);
}
values[stored++] = fetchatt(thisatt, tp + off);
off = att_addlength_pointer(off, thisatt->attlen, tp + off);
}
}
}
/*
* brin_xlog.c
* XLog replay routines for BRIN indexes
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/brin/brin_xlog.c
*/
#include "postgres.h"
#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_xlog.h"
#include "access/xlogutils.h"
/*
* xlog replay routines
*/
static void
brin_xlog_createidx(XLogRecPtr lsn, XLogRecord *record)
{
xl_brin_createidx *xlrec = (xl_brin_createidx *) XLogRecGetData(record);
Buffer buf;
Page page;
/* Backup blocks are not used in create_index records */
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
/* create the index' metapage */
buf = XLogReadBuffer(xlrec->node, BRIN_METAPAGE_BLKNO, true);
Assert(BufferIsValid(buf));
page = (Page) BufferGetPage(buf);
brin_metapage_init(page, xlrec->pagesPerRange, xlrec->version);
PageSetLSN(page, lsn);
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
}
/*
* Common part of an insert or update. Inserts the new tuple and updates the
* revmap.
*/
static void
brin_xlog_insert_update(XLogRecPtr lsn, XLogRecord *record,
xl_brin_insert *xlrec, BrinTuple *tuple)
{
BlockNumber blkno;
Buffer buffer;
Page page;
XLogRedoAction action;
blkno = ItemPointerGetBlockNumber(&xlrec->tid);
/*
* If we inserted the first and only tuple on the page, re-initialize the
* page from scratch.
*/
if (record->xl_info & XLOG_BRIN_INIT_PAGE)
{
XLogReadBufferForRedoExtended(lsn, record, 0,
xlrec->node, MAIN_FORKNUM, blkno,
RBM_ZERO, false, &buffer);
page = BufferGetPage(buffer);
brin_page_init(page, BRIN_PAGETYPE_REGULAR);
action = BLK_NEEDS_REDO;
}
else
{
action = XLogReadBufferForRedo(lsn, record, 0,
xlrec->node, blkno, &buffer);
}
/* insert the index item into the page */
if (action == BLK_NEEDS_REDO)
{
OffsetNumber offnum;
Assert(tuple->bt_blkno == xlrec->heapBlk);
page = (Page) BufferGetPage(buffer);
offnum = ItemPointerGetOffsetNumber(&(xlrec->tid));
if (PageGetMaxOffsetNumber(page) + 1 < offnum)
elog(PANIC, "brin_xlog_insert_update: invalid max offset number");
offnum = PageAddItem(page, (Item) tuple, xlrec->tuplen, offnum, true,
false);
if (offnum == InvalidOffsetNumber)
elog(PANIC, "brin_xlog_insert_update: failed to add tuple");
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
/* update the revmap */
action = XLogReadBufferForRedo(lsn, record, 1, xlrec->node,
xlrec->revmapBlk, &buffer);
if (action == BLK_NEEDS_REDO)
{
page = (Page) BufferGetPage(buffer);
brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk,
xlrec->tid);
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
/* XXX no FSM updates here ... */
}
/*
* replay a BRIN index insertion
*/
static void
brin_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
{
xl_brin_insert *xlrec = (xl_brin_insert *) XLogRecGetData(record);
BrinTuple *newtup;
newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinInsert);
brin_xlog_insert_update(lsn, record, xlrec, newtup);
}
/*
* replay a BRIN index update
*/
static void
brin_xlog_update(XLogRecPtr lsn, XLogRecord *record)
{
xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record);
BlockNumber blkno;
Buffer buffer;
BrinTuple *newtup;
XLogRedoAction action;
newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinUpdate);
/* First remove the old tuple */
blkno = ItemPointerGetBlockNumber(&(xlrec->oldtid));
action = XLogReadBufferForRedo(lsn, record, 2, xlrec->new.node,
blkno, &buffer);
if (action == BLK_NEEDS_REDO)
{
Page page;
OffsetNumber offnum;
page = (Page) BufferGetPage(buffer);
offnum = ItemPointerGetOffsetNumber(&(xlrec->oldtid));
if (PageGetMaxOffsetNumber(page) + 1 < offnum)
elog(PANIC, "brin_xlog_update: invalid max offset number");
PageIndexDeleteNoCompact(page, &offnum, 1);
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
/* Then insert the new tuple and update revmap, like in an insertion. */
brin_xlog_insert_update(lsn, record, &xlrec->new, newtup);
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
}
/*
* Update a tuple on a single page.
*/
static void
brin_xlog_samepage_update(XLogRecPtr lsn, XLogRecord *record)
{
xl_brin_samepage_update *xlrec;
BlockNumber blkno;
Buffer buffer;
XLogRedoAction action;
xlrec = (xl_brin_samepage_update *) XLogRecGetData(record);
blkno = ItemPointerGetBlockNumber(&(xlrec->tid));
action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node, blkno,
&buffer);
if (action == BLK_NEEDS_REDO)
{
int tuplen;
BrinTuple *mmtuple;
Page page;
OffsetNumber offnum;
tuplen = record->xl_len - SizeOfBrinSamepageUpdate;
mmtuple = (BrinTuple *) ((char *) xlrec + SizeOfBrinSamepageUpdate);
page = (Page) BufferGetPage(buffer);
offnum = ItemPointerGetOffsetNumber(&(xlrec->tid));
if (PageGetMaxOffsetNumber(page) + 1 < offnum)
elog(PANIC, "brin_xlog_samepage_update: invalid max offset number");
PageIndexDeleteNoCompact(page, &offnum, 1);
offnum = PageAddItem(page, (Item) mmtuple, tuplen, offnum, true, false);
if (offnum == InvalidOffsetNumber)
elog(PANIC, "brin_xlog_samepage_update: failed to add tuple");
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
/* XXX no FSM updates here ... */
}
/*
* Replay a revmap page extension
*/
static void
brin_xlog_revmap_extend(XLogRecPtr lsn, XLogRecord *record)
{
xl_brin_revmap_extend *xlrec;
Buffer metabuf;
Buffer buf;
Page page;
XLogRedoAction action;
xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record);
/* Update the metapage */
action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node,
BRIN_METAPAGE_BLKNO, &metabuf);
if (action == BLK_NEEDS_REDO)
{
Page metapg;
BrinMetaPageData *metadata;
metapg = BufferGetPage(metabuf);
metadata = (BrinMetaPageData *) PageGetContents(metapg);
Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1);
metadata->lastRevmapPage = xlrec->targetBlk;
PageSetLSN(metapg, lsn);
MarkBufferDirty(metabuf);
}
/*
* Re-init the target block as a revmap page. There's never a full- page
* image here.
*/
buf = XLogReadBuffer(xlrec->node, xlrec->targetBlk, true);
page = (Page) BufferGetPage(buf);
brin_page_init(page, BRIN_PAGETYPE_REVMAP);
PageSetLSN(page, lsn);
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
if (BufferIsValid(metabuf))
UnlockReleaseBuffer(metabuf);
}
void
brin_redo(XLogRecPtr lsn, XLogRecord *record)
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
switch (info & XLOG_BRIN_OPMASK)
{
case XLOG_BRIN_CREATE_INDEX:
brin_xlog_createidx(lsn, record);
break;
case XLOG_BRIN_INSERT:
brin_xlog_insert(lsn, record);
break;
case XLOG_BRIN_UPDATE:
brin_xlog_update(lsn, record);
break;
case XLOG_BRIN_SAMEPAGE_UPDATE:
brin_xlog_samepage_update(lsn, record);
break;
case XLOG_BRIN_REVMAP_EXTEND:
brin_xlog_revmap_extend(lsn, record);
break;
default:
elog(PANIC, "brin_redo: unknown op code %u", info);
}
}
......@@ -209,6 +209,13 @@ static relopt_int intRelOpts[] =
RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
}, -1, 0, 2000000000
},
{
{
"pages_per_range",
"Number of pages that each page range covers in a BRIN index",
RELOPT_KIND_BRIN
}, 128, 1, 131072
},
/* list terminator */
{{NULL}}
......
......@@ -272,6 +272,8 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
scan->rs_startblock = 0;
}
scan->rs_initblock = 0;
scan->rs_numblocks = InvalidBlockNumber;
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
......@@ -297,6 +299,14 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
pgstat_count_heap_scan(scan->rs_rd);
}
void
heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
{
scan->rs_startblock = startBlk;
scan->rs_initblock = startBlk;
scan->rs_numblocks = numBlks;
}
/*
* heapgetpage - subroutine for heapgettup()
*
......@@ -637,7 +647,8 @@ heapgettup(HeapScanDesc scan,
*/
if (backward)
{
finished = (page == scan->rs_startblock);
finished = (page == scan->rs_startblock) ||
(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
if (page == 0)
page = scan->rs_nblocks;
page--;
......@@ -647,7 +658,8 @@ heapgettup(HeapScanDesc scan,
page++;
if (page >= scan->rs_nblocks)
page = 0;
finished = (page == scan->rs_startblock);
finished = (page == scan->rs_startblock) ||
(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
/*
* Report our new scan position for synchronization purposes. We
......@@ -898,7 +910,8 @@ heapgettup_pagemode(HeapScanDesc scan,
*/
if (backward)
{
finished = (page == scan->rs_startblock);
finished = (page == scan->rs_startblock) ||
(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
if (page == 0)
page = scan->rs_nblocks;
page--;
......@@ -908,7 +921,8 @@ heapgettup_pagemode(HeapScanDesc scan,
page++;
if (page >= scan->rs_nblocks)
page = 0;
finished = (page == scan->rs_startblock);
finished = (page == scan->rs_startblock) ||
(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
/*
* Report our new scan position for synchronization purposes. We
......
......@@ -8,7 +8,8 @@ subdir = src/backend/access/rmgrdesc
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \
OBJS = brindesc.o clogdesc.o dbasedesc.o gindesc.o gistdesc.o \
hashdesc.o heapdesc.o \
mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o smgrdesc.o spgdesc.o \
standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o
......
/*-------------------------------------------------------------------------
*
* brindesc.c
* rmgr descriptor routines for BRIN indexes
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/rmgrdesc/brindesc.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/brin_xlog.h"
void
brin_desc(StringInfo buf, XLogRecord *record)
{
char *rec = XLogRecGetData(record);
uint8 info = record->xl_info & ~XLR_INFO_MASK;
info &= XLOG_BRIN_OPMASK;
if (info == XLOG_BRIN_CREATE_INDEX)
{
xl_brin_createidx *xlrec = (xl_brin_createidx *) rec;
appendStringInfo(buf, "v%d pagesPerRange %u rel %u/%u/%u",
xlrec->version, xlrec->pagesPerRange,
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode);
}
else if (info == XLOG_BRIN_INSERT)
{
xl_brin_insert *xlrec = (xl_brin_insert *) rec;
appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u TID (%u,%u)",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode,
xlrec->heapBlk, xlrec->revmapBlk,
xlrec->pagesPerRange,
ItemPointerGetBlockNumber(&xlrec->tid),
ItemPointerGetOffsetNumber(&xlrec->tid));
}
else if (info == XLOG_BRIN_UPDATE)
{
xl_brin_update *xlrec = (xl_brin_update *) rec;
appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u old TID (%u,%u) TID (%u,%u)",
xlrec->new.node.spcNode, xlrec->new.node.dbNode,
xlrec->new.node.relNode,
xlrec->new.heapBlk, xlrec->new.revmapBlk,
xlrec->new.pagesPerRange,
ItemPointerGetBlockNumber(&xlrec->oldtid),
ItemPointerGetOffsetNumber(&xlrec->oldtid),
ItemPointerGetBlockNumber(&xlrec->new.tid),
ItemPointerGetOffsetNumber(&xlrec->new.tid));
}
else if (info == XLOG_BRIN_SAMEPAGE_UPDATE)
{
xl_brin_samepage_update *xlrec = (xl_brin_samepage_update *) rec;
appendStringInfo(buf, "rel %u/%u/%u TID (%u,%u)",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode,
ItemPointerGetBlockNumber(&xlrec->tid),
ItemPointerGetOffsetNumber(&xlrec->tid));
}
else if (info == XLOG_BRIN_REVMAP_EXTEND)
{
xl_brin_revmap_extend *xlrec = (xl_brin_revmap_extend *) rec;
appendStringInfo(buf, "rel %u/%u/%u targetBlk %u",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode, xlrec->targetBlk);
}
}
const char *
brin_identify(uint8 info)
{
const char *id = NULL;
switch (info & ~XLR_INFO_MASK)
{
case XLOG_BRIN_CREATE_INDEX:
id = "CREATE_INDEX";
break;
case XLOG_BRIN_INSERT:
id = "INSERT";
break;
case XLOG_BRIN_INSERT | XLOG_BRIN_INIT_PAGE:
id = "INSERT+INIT";
break;
case XLOG_BRIN_UPDATE:
id = "UPDATE";
break;
case XLOG_BRIN_UPDATE | XLOG_BRIN_INIT_PAGE:
id = "UPDATE+INIT";
break;
case XLOG_BRIN_SAMEPAGE_UPDATE:
id = "SAMEPAGE_UPDATE";
break;
case XLOG_BRIN_REVMAP_EXTEND:
id = "REVMAP_EXTEND";
break;
}
return id;
}
......@@ -12,6 +12,7 @@
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/heapam_xlog.h"
#include "access/brin_xlog.h"
#include "access/multixact.h"
#include "access/nbtree.h"
#include "access/spgist.h"
......
......@@ -2103,6 +2103,27 @@ IndexBuildHeapScan(Relation heapRelation,
bool allow_sync,
IndexBuildCallback callback,
void *callback_state)
{
return IndexBuildHeapRangeScan(heapRelation, indexRelation,
indexInfo, allow_sync,
0, InvalidBlockNumber,
callback, callback_state);
}
/*
* As above, except that instead of scanning the complete heap, only the given
* number of blocks are scanned. Scan to end-of-rel can be signalled by
* passing InvalidBlockNumber as numblocks.
*/
double
IndexBuildHeapRangeScan(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
bool allow_sync,
BlockNumber start_blockno,
BlockNumber numblocks,
IndexBuildCallback callback,
void *callback_state)
{
bool is_system_catalog;
bool checking_uniqueness;
......@@ -2174,6 +2195,9 @@ IndexBuildHeapScan(Relation heapRelation,
true, /* buffer access strategy OK */
allow_sync); /* syncscan OK? */
/* set our scan endpoints */
heap_setscanlimits(scan, start_blockno, numblocks);
reltuples = 0;
/*
......
......@@ -132,6 +132,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogRecord *record)
case RM_GIST_ID:
case RM_SEQ_ID:
case RM_SPGIST_ID:
case RM_BRIN_ID:
break;
case RM_NEXT_ID:
elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) buf.record.xl_rmid);
......
......@@ -399,7 +399,8 @@ PageRestoreTempPage(Page tempPage, Page oldPage)
}
/*
* sorting support for PageRepairFragmentation and PageIndexMultiDelete
* sorting support for PageRepairFragmentation, PageIndexMultiDelete,
* PageIndexDeleteNoCompact
*/
typedef struct itemIdSortData
{
......@@ -896,6 +897,182 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
phdr->pd_upper = upper;
}
/*
* PageIndexDeleteNoCompact
* Delete the given items for an index page, and defragment the resulting
* free space, but do not compact the item pointers array.
*
* itemnos is the array of tuples to delete; nitems is its size. maxIdxTuples
* is the maximum number of tuples that can exist in a page.
*
* Unused items at the end of the array are removed.
*
* This is used for index AMs that require that existing TIDs of live tuples
* remain unchanged.
*/
void
PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems)
{
PageHeader phdr = (PageHeader) page;
LocationIndex pd_lower = phdr->pd_lower;
LocationIndex pd_upper = phdr->pd_upper;
LocationIndex pd_special = phdr->pd_special;
int nline;
bool empty;
OffsetNumber offnum;
int nextitm;
/*
* As with PageRepairFragmentation, paranoia seems justified.
*/
if (pd_lower < SizeOfPageHeaderData ||
pd_lower > pd_upper ||
pd_upper > pd_special ||
pd_special > BLCKSZ ||
pd_special != MAXALIGN(pd_special))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
pd_lower, pd_upper, pd_special)));
/*
* Scan the existing item pointer array and mark as unused those that are
* in our kill-list; make sure any non-interesting ones are marked unused
* as well.
*/
nline = PageGetMaxOffsetNumber(page);
empty = true;
nextitm = 0;
for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
{
ItemId lp;
ItemLength itemlen;
ItemOffset offset;
lp = PageGetItemId(page, offnum);
itemlen = ItemIdGetLength(lp);
offset = ItemIdGetOffset(lp);
if (ItemIdIsUsed(lp))
{
if (offset < pd_upper ||
(offset + itemlen) > pd_special ||
offset != MAXALIGN(offset))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("corrupted item pointer: offset = %u, length = %u",
offset, (unsigned int) itemlen)));
if (nextitm < nitems && offnum == itemnos[nextitm])
{
/* this one is on our list to delete, so mark it unused */
ItemIdSetUnused(lp);
nextitm++;
}
else if (ItemIdHasStorage(lp))
{
/* This one's live -- must do the compaction dance */
empty = false;
}
else
{
/* get rid of this one too */
ItemIdSetUnused(lp);
}
}
}
/* this will catch invalid or out-of-order itemnos[] */
if (nextitm != nitems)
elog(ERROR, "incorrect index offsets supplied");
if (empty)
{
/* Page is completely empty, so just reset it quickly */
phdr->pd_lower = SizeOfPageHeaderData;
phdr->pd_upper = pd_special;
}
else
{
/* There are live items: need to compact the page the hard way */
itemIdSortData itemidbase[MaxOffsetNumber];
itemIdSort itemidptr;
int i;
Size totallen;
Offset upper;
/*
* Scan the page taking note of each item that we need to preserve.
* This includes both live items (those that contain data) and
* interspersed unused ones. It's critical to preserve these unused
* items, because otherwise the offset numbers for later live items
* would change, which is not acceptable. Unused items might get used
* again later; that is fine.
*/
itemidptr = itemidbase;
totallen = 0;
for (i = 0; i < nline; i++, itemidptr++)
{
ItemId lp;
itemidptr->offsetindex = i;
lp = PageGetItemId(page, i + 1);
if (ItemIdHasStorage(lp))
{
itemidptr->itemoff = ItemIdGetOffset(lp);
itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
totallen += itemidptr->alignedlen;
}
else
{
itemidptr->itemoff = 0;
itemidptr->alignedlen = 0;
}
}
/* By here, there are exactly nline elements in itemidbase array */
if (totallen > (Size) (pd_special - pd_lower))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("corrupted item lengths: total %u, available space %u",
(unsigned int) totallen, pd_special - pd_lower)));
/* sort itemIdSortData array into decreasing itemoff order */
qsort((char *) itemidbase, nline, sizeof(itemIdSortData),
itemoffcompare);
/*
* Defragment the data areas of each tuple, being careful to preserve
* each item's position in the linp array.
*/
upper = pd_special;
PageClearHasFreeLinePointers(page);
for (i = 0, itemidptr = itemidbase; i < nline; i++, itemidptr++)
{
ItemId lp;
lp = PageGetItemId(page, itemidptr->offsetindex + 1);
if (itemidptr->alignedlen == 0)
{
PageSetHasFreeLinePointers(page);
ItemIdSetUnused(lp);
continue;
}
upper -= itemidptr->alignedlen;
memmove((char *) page + upper,
(char *) page + itemidptr->itemoff,
itemidptr->alignedlen);
lp->lp_off = upper;
/* lp_flags and lp_len remain the same as originally */
}
/* Set the new page limits */
phdr->pd_upper = upper;
phdr->pd_lower = SizeOfPageHeaderData + i * sizeof(ItemIdData);
}
}
/*
* Set checksum for a page in shared buffers.
......
......@@ -6081,7 +6081,7 @@ genericcostestimate(PlannerInfo *root,
else
numIndexPages = 1.0;
/* fetch estimated page cost for schema containing index */
/* fetch estimated page cost for tablespace containing index */
get_tablespace_page_costs(index->reltablespace,
&spc_random_page_cost,
NULL);
......@@ -7162,7 +7162,7 @@ gincostestimate(PG_FUNCTION_ARGS)
JOIN_INNER,
NULL);
/* fetch estimated page cost for schema containing index */
/* fetch estimated page cost for tablespace containing index */
get_tablespace_page_costs(index->reltablespace,
&spc_random_page_cost,
NULL);
......@@ -7349,3 +7349,73 @@ gincostestimate(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
/*
* BRIN has search behavior completely different from other index types
*/
Datum
brincostestimate(PG_FUNCTION_ARGS)
{
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
IndexPath *path = (IndexPath *) PG_GETARG_POINTER(1);
double loop_count = PG_GETARG_FLOAT8(2);
Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
IndexOptInfo *index = path->indexinfo;
List *indexQuals = path->indexquals;
List *indexOrderBys = path->indexorderbys;
double numPages = index->pages;
double numTuples = index->tuples;
Cost spc_seq_page_cost;
Cost spc_random_page_cost;
QualCost index_qual_cost;
double qual_op_cost;
double qual_arg_cost;
/* fetch estimated page cost for tablespace containing index */
get_tablespace_page_costs(index->reltablespace,
&spc_random_page_cost,
&spc_seq_page_cost);
/*
* BRIN indexes are always read in full; use that as startup cost.
* XXX maybe only include revmap pages here?
*/
*indexStartupCost = spc_seq_page_cost * numPages * loop_count;
/*
* To read a BRIN index there might be a bit of back and forth over regular
* pages, as revmap might point to them out of sequential order; calculate
* this as reading the whole index in random order.
*/
*indexTotalCost = spc_random_page_cost * numPages * loop_count;
*indexSelectivity =
clauselist_selectivity(root, path->indexquals,
path->indexinfo->rel->relid,
JOIN_INNER, NULL);
*indexCorrelation = 1;
/*
* Add on index qual eval costs, much as in genericcostestimate.
*/
cost_qual_eval(&index_qual_cost, indexQuals, root);
qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
cost_qual_eval(&index_qual_cost, indexOrderBys, root);
qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
qual_op_cost = cpu_operator_cost *
(list_length(indexQuals) + list_length(indexOrderBys));
qual_arg_cost -= qual_op_cost;
if (qual_arg_cost < 0) /* just in case... */
qual_arg_cost = 0;
*indexStartupCost += qual_arg_cost;
*indexTotalCost += qual_arg_cost;
*indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
/* XXX what about pages_per_range? */
PG_RETURN_VOID();
}
/*
* AM-callable functions for BRIN indexes
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin.h
*/
#ifndef BRIN_H
#define BRIN_H
#include "fmgr.h"
#include "nodes/execnodes.h"
#include "utils/relcache.h"
/*
* prototypes for functions in brin.c (external entry points for BRIN)
*/
extern Datum brinbuild(PG_FUNCTION_ARGS);
extern Datum brinbuildempty(PG_FUNCTION_ARGS);
extern Datum brininsert(PG_FUNCTION_ARGS);
extern Datum brinbeginscan(PG_FUNCTION_ARGS);
extern Datum bringettuple(PG_FUNCTION_ARGS);
extern Datum bringetbitmap(PG_FUNCTION_ARGS);
extern Datum brinrescan(PG_FUNCTION_ARGS);
extern Datum brinendscan(PG_FUNCTION_ARGS);
extern Datum brinmarkpos(PG_FUNCTION_ARGS);
extern Datum brinrestrpos(PG_FUNCTION_ARGS);
extern Datum brinbulkdelete(PG_FUNCTION_ARGS);
extern Datum brinvacuumcleanup(PG_FUNCTION_ARGS);
extern Datum brincanreturn(PG_FUNCTION_ARGS);
extern Datum brincostestimate(PG_FUNCTION_ARGS);
extern Datum brinoptions(PG_FUNCTION_ARGS);
/*
* Storage type for BRIN's reloptions
*/
typedef struct BrinOptions
{
int32 vl_len_; /* varlena header (do not touch directly!) */
BlockNumber pagesPerRange;
} BrinOptions;
#define BRIN_DEFAULT_PAGES_PER_RANGE 128
#define BrinGetPagesPerRange(relation) \
((relation)->rd_options ? \
((BrinOptions *) (relation)->rd_options)->pagesPerRange : \
BRIN_DEFAULT_PAGES_PER_RANGE)
#endif /* BRIN_H */
/*
* brin_internal.h
* internal declarations for BRIN indexes
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_internal.h
*/
#ifndef BRIN_INTERNAL_H
#define BRIN_INTERNAL_H
#include "fmgr.h"
#include "storage/buf.h"
#include "storage/bufpage.h"
#include "storage/off.h"
#include "utils/relcache.h"
/*
* A BrinDesc is a struct designed to enable decoding a BRIN tuple from the
* on-disk format to an in-memory tuple and vice-versa.
*/
/* struct returned by "OpcInfo" amproc */
typedef struct BrinOpcInfo
{
/* Number of columns stored in an index column of this opclass */
uint16 oi_nstored;
/* Opaque pointer for the opclass' private use */
void *oi_opaque;
/* Type IDs of the stored columns */
Oid oi_typids[FLEXIBLE_ARRAY_MEMBER];
} BrinOpcInfo;
/* the size of a BrinOpcInfo for the given number of columns */
#define SizeofBrinOpcInfo(ncols) \
(offsetof(BrinOpcInfo, oi_typids) + sizeof(Oid) * ncols)
typedef struct BrinDesc
{
/* Containing memory context */
MemoryContext bd_context;
/* the index relation itself */
Relation bd_index;
/* tuple descriptor of the index relation */
TupleDesc bd_tupdesc;
/* cached copy for on-disk tuples; generated at first use */
TupleDesc bd_disktdesc;
/* total number of Datum entries that are stored on-disk for all columns */
int bd_totalstored;
/* per-column info; bd_tupdesc->natts entries long */
BrinOpcInfo *bd_info[FLEXIBLE_ARRAY_MEMBER];
} BrinDesc;
/*
* Globally-known function support numbers for BRIN indexes. Individual
* opclasses define their own function support numbers, which must not collide
* with the definitions here.
*/
#define BRIN_PROCNUM_OPCINFO 1
#define BRIN_PROCNUM_ADDVALUE 2
#define BRIN_PROCNUM_CONSISTENT 3
#define BRIN_PROCNUM_UNION 4
/* procedure numbers up to 10 are reserved for BRIN future expansion */
#define BRIN_DEBUG
/* we allow debug if using GCC; otherwise don't bother */
#if defined(BRIN_DEBUG) && defined(__GNUC__)
#define BRIN_elog(level, ...) elog(level, __VA_ARGS__)
#else
#define BRIN_elog(a) void(0)
#endif
/* brin.c */
extern BrinDesc *brin_build_desc(Relation rel);
extern void brin_free_desc(BrinDesc *bdesc);
#endif /* BRIN_INTERNAL_H */
/*
* brin_page.h
* Prototypes and definitions for BRIN page layouts
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_page.h
*
* NOTES
*
* These structs should really be private to specific BRIN files, but it's
* useful to have them here so that they can be used by pageinspect and similar
* tools.
*/
#ifndef BRIN_PAGE_H
#define BRIN_PAGE_H
#include "storage/block.h"
#include "storage/itemptr.h"
/* special space on all BRIN pages stores a "type" identifier */
#define BRIN_PAGETYPE_META 0xF091
#define BRIN_PAGETYPE_REVMAP 0xF092
#define BRIN_PAGETYPE_REGULAR 0xF093
#define BRIN_PAGE_TYPE(page) \
(((BrinSpecialSpace *) PageGetSpecialPointer(page))->type)
#define BRIN_IS_REVMAP_PAGE(page) (BRIN_PAGE_TYPE(page) == BRIN_PAGETYPE_REVMAP)
#define BRIN_IS_REGULAR_PAGE(page) (BRIN_PAGE_TYPE(page) == BRIN_PAGETYPE_REGULAR)
/* flags for BrinSpecialSpace */
#define BRIN_EVACUATE_PAGE (1 << 0)
typedef struct BrinSpecialSpace
{
uint16 flags;
uint16 type;
} BrinSpecialSpace;
/* Metapage definitions */
typedef struct BrinMetaPageData
{
uint32 brinMagic;
uint32 brinVersion;
BlockNumber pagesPerRange;
BlockNumber lastRevmapPage;
} BrinMetaPageData;
#define BRIN_CURRENT_VERSION 1
#define BRIN_META_MAGIC 0xA8109CFA
#define BRIN_METAPAGE_BLKNO 0
/* Definitions for revmap pages */
typedef struct RevmapContents
{
ItemPointerData rm_tids[1]; /* really REVMAP_PAGE_MAXITEMS */
} RevmapContents;
#define REVMAP_CONTENT_SIZE \
(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
offsetof(RevmapContents, rm_tids) - \
MAXALIGN(sizeof(BrinSpecialSpace)))
/* max num of items in the array */
#define REVMAP_PAGE_MAXITEMS \
(REVMAP_CONTENT_SIZE / sizeof(ItemPointerData))
#endif /* BRIN_PAGE_H */
/*
* brin_pageops.h
* Prototypes for operating on BRIN pages.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_pageops.h
*/
#ifndef BRIN_PAGEOPS_H
#define BRIN_PAGEOPS_H
#include "access/brin_revmap.h"
extern bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
BrinRevmap *revmap, BlockNumber heapBlk,
Buffer oldbuf, OffsetNumber oldoff,
const BrinTuple *origtup, Size origsz,
const BrinTuple *newtup, Size newsz,
bool samepage);
extern bool brin_can_do_samepage_update(Buffer buffer, Size origsz,
Size newsz);
extern OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
BrinTuple *tup, Size itemsz);
extern void brin_page_init(Page page, uint16 type);
extern void brin_metapage_init(Page page, BlockNumber pagesPerRange,
uint16 version);
extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf);
extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
BrinRevmap *revmap, Buffer buf);
#endif /* BRIN_PAGEOPS_H */
/*
* brin_revmap.h
* Prototypes for BRIN reverse range maps
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_revmap.h
*/
#ifndef BRIN_REVMAP_H
#define BRIN_REVMAP_H
#include "access/brin_tuple.h"
#include "storage/block.h"
#include "storage/buf.h"
#include "storage/itemptr.h"
#include "storage/off.h"
#include "utils/relcache.h"
/* struct definition lives in brin_revmap.c */
typedef struct BrinRevmap BrinRevmap;
extern BrinRevmap *brinRevmapInitialize(Relation idxrel,
BlockNumber *pagesPerRange);
extern void brinRevmapTerminate(BrinRevmap *revmap);
extern void brinRevmapExtend(BrinRevmap *revmap,
BlockNumber heapBlk);
extern Buffer brinLockRevmapPageForUpdate(BrinRevmap *revmap,
BlockNumber heapBlk);
extern void brinSetHeapBlockItemptr(Buffer rmbuf, BlockNumber pagesPerRange,
BlockNumber heapBlk, ItemPointerData tid);
extern BrinTuple *brinGetTupleForHeapBlock(BrinRevmap *revmap,
BlockNumber heapBlk, Buffer *buf, OffsetNumber *off,
Size *size, int mode);
#endif /* BRIN_REVMAP_H */
/*
* brin_tuple.h
* Declarations for dealing with BRIN-specific tuples.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_tuple.h
*/
#ifndef BRIN_TUPLE_H
#define BRIN_TUPLE_H
#include "access/brin_internal.h"
#include "access/tupdesc.h"
/*
* A BRIN index stores one index tuple per page range. Each index tuple
* has one BrinValues struct for each indexed column; in turn, each BrinValues
* has (besides the null flags) an array of Datum whose size is determined by
* the opclass.
*/
typedef struct BrinValues
{
AttrNumber bv_attno; /* index attribute number */
bool bv_hasnulls; /* is there any nulls in the page range? */
bool bv_allnulls; /* are all values nulls in the page range? */
Datum *bv_values; /* current accumulated values */
} BrinValues;
/*
* This struct is used to represent an in-memory index tuple. The values can
* only be meaningfully decoded with an appropriate BrinDesc.
*/
typedef struct BrinMemTuple
{
bool bt_placeholder; /* this is a placeholder tuple */
BlockNumber bt_blkno; /* heap blkno that the tuple is for */
MemoryContext bt_context; /* memcxt holding the dt_column values */
BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER];
} BrinMemTuple;
/*
* An on-disk BRIN tuple. This is possibly followed by a nulls bitmask, with
* room for 2 null bits (two bits for each indexed column); an opclass-defined
* number of Datum values for each column follow.
*/
typedef struct BrinTuple
{
/* heap block number that the tuple is for */
BlockNumber bt_blkno;
/* ---------------
* mt_info is laid out in the following fashion:
*
* 7th (high) bit: has nulls
* 6th bit: is placeholder tuple
* 5th bit: unused
* 4-0 bit: offset of data
* ---------------
*/
uint8 bt_info;
} BrinTuple;
#define SizeOfBrinTuple (offsetof(BrinTuple, bt_info) + sizeof(uint8))
/*
* t_info manipulation macros
*/
#define BRIN_OFFSET_MASK 0x1F
/* bit 0x20 is not used at present */
#define BRIN_PLACEHOLDER_MASK 0x40
#define BRIN_NULLS_MASK 0x80
#define BrinTupleDataOffset(tup) ((Size) (((BrinTuple *) (tup))->bt_info & BRIN_OFFSET_MASK))
#define BrinTupleHasNulls(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_NULLS_MASK)) != 0)
#define BrinTupleIsPlaceholder(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_PLACEHOLDER_MASK)) != 0)
extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno,
BrinMemTuple *tuple, Size *size);
extern BrinTuple *brin_form_placeholder_tuple(BrinDesc *brdesc,
BlockNumber blkno, Size *size);
extern void brin_free_tuple(BrinTuple *tuple);
extern BrinTuple *brin_copy_tuple(BrinTuple *tuple, Size len);
extern bool brin_tuples_equal(const BrinTuple *a, Size alen,
const BrinTuple *b, Size blen);
extern BrinMemTuple *brin_new_memtuple(BrinDesc *brdesc);
extern void brin_memtuple_initialize(BrinMemTuple *dtuple,
BrinDesc *brdesc);
extern BrinMemTuple *brin_deform_tuple(BrinDesc *brdesc,
BrinTuple *tuple);
#endif /* BRIN_TUPLE_H */
/*-------------------------------------------------------------------------
*
* brin_xlog.h
* POSTGRES BRIN access XLOG definitions.
*
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/brin_xlog.h
*
*-------------------------------------------------------------------------
*/
#ifndef BRIN_XLOG_H
#define BRIN_XLOG_H
#include "access/xlogrecord.h"
#include "lib/stringinfo.h"
#include "storage/bufpage.h"
#include "storage/itemptr.h"
#include "storage/relfilenode.h"
#include "utils/relcache.h"
/*
* WAL record definitions for BRIN's WAL operations
*
* XLOG allows to store some information in high 4 bits of log
* record xl_info field.
*/
#define XLOG_BRIN_CREATE_INDEX 0x00
#define XLOG_BRIN_INSERT 0x10
#define XLOG_BRIN_UPDATE 0x20
#define XLOG_BRIN_SAMEPAGE_UPDATE 0x30
#define XLOG_BRIN_REVMAP_EXTEND 0x40
#define XLOG_BRIN_REVMAP_VACUUM 0x50
#define XLOG_BRIN_OPMASK 0x70
/*
* When we insert the first item on a new page, we restore the entire page in
* redo.
*/
#define XLOG_BRIN_INIT_PAGE 0x80
/* This is what we need to know about a BRIN index create */
typedef struct xl_brin_createidx
{
BlockNumber pagesPerRange;
RelFileNode node;
uint16 version;
} xl_brin_createidx;
#define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16))
/*
* This is what we need to know about a BRIN tuple insert
*/
typedef struct xl_brin_insert
{
RelFileNode node;
BlockNumber heapBlk;
/* extra information needed to update the revmap */
BlockNumber revmapBlk;
BlockNumber pagesPerRange;
uint16 tuplen;
ItemPointerData tid;
/* tuple data follows at end of struct */
} xl_brin_insert;
#define SizeOfBrinInsert (offsetof(xl_brin_insert, tid) + sizeof(ItemPointerData))
/*
* A cross-page update is the same as an insert, but also store the old tid.
*/
typedef struct xl_brin_update
{
ItemPointerData oldtid;
xl_brin_insert new;
} xl_brin_update;
#define SizeOfBrinUpdate (offsetof(xl_brin_update, new) + SizeOfBrinInsert)
/* This is what we need to know about a BRIN tuple samepage update */
typedef struct xl_brin_samepage_update
{
RelFileNode node;
ItemPointerData tid;
/* tuple data follows at end of struct */
} xl_brin_samepage_update;
#define SizeOfBrinSamepageUpdate (offsetof(xl_brin_samepage_update, tid) + sizeof(ItemPointerData))
/* This is what we need to know about a revmap extension */
typedef struct xl_brin_revmap_extend
{
RelFileNode node;
BlockNumber targetBlk;
} xl_brin_revmap_extend;
#define SizeOfBrinRevmapExtend (offsetof(xl_brin_revmap_extend, targetBlk) + \
sizeof(BlockNumber))
extern void brin_desc(StringInfo buf, XLogRecord *record);
extern void brin_redo(XLogRecPtr lsn, XLogRecord *record);
extern const char *brin_identify(uint8 info);
#endif /* BRIN_XLOG_H */
......@@ -113,6 +113,8 @@ extern HeapScanDesc heap_beginscan_strat(Relation relation, Snapshot snapshot,
bool allow_strat, bool allow_sync);
extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot,
int nkeys, ScanKey key);
extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk,
BlockNumber endBlk);
extern void heap_rescan(HeapScanDesc scan, ScanKey key);
extern void heap_endscan(HeapScanDesc scan);
extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction);
......
......@@ -45,8 +45,9 @@ typedef enum relopt_kind
RELOPT_KIND_TABLESPACE = (1 << 7),
RELOPT_KIND_SPGIST = (1 << 8),
RELOPT_KIND_VIEW = (1 << 9),
RELOPT_KIND_BRIN = (1 << 10),
/* if you add a new kind, make sure you update "last_default" too */
RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_VIEW,
RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_BRIN,
/* some compilers treat enums as signed ints, so we can't use 1 << 31 */
RELOPT_KIND_MAX = (1 << 30)
} relopt_kind;
......
......@@ -35,8 +35,10 @@ typedef struct HeapScanDescData
bool rs_temp_snap; /* unregister snapshot at scan end? */
/* state set up at initscan time */
BlockNumber rs_nblocks; /* number of blocks to scan */
BlockNumber rs_nblocks; /* total number of blocks in rel */
BlockNumber rs_startblock; /* block # to start at */
BlockNumber rs_initblock; /* block # to consider initial of rel */
BlockNumber rs_numblocks; /* number of blocks to scan */
BufferAccessStrategy rs_strategy; /* access strategy for reads */
bool rs_syncscan; /* report location to syncscan logic? */
......
......@@ -42,3 +42,4 @@ PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gi
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup)
PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL)
PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup)
PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL)
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201411041
#define CATALOG_VERSION_NO 201411071
#endif
......@@ -98,6 +98,14 @@ extern double IndexBuildHeapScan(Relation heapRelation,
bool allow_sync,
IndexBuildCallback callback,
void *callback_state);
extern double IndexBuildHeapRangeScan(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
bool allow_sync,
BlockNumber start_blockno,
BlockNumber end_blockno,
IndexBuildCallback callback,
void *callback_state);
extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
......
......@@ -132,5 +132,7 @@ DESCR("GIN index access method");
DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions ));
DESCR("SP-GiST index access method");
#define SPGIST_AM_OID 4000
DATA(insert OID = 3580 ( brin 5 14 f f f f t t f t t f f 0 brininsert brinbeginscan - bringetbitmap brinrescan brinendscan brinmarkpos brinrestrpos brinbuild brinbuildempty brinbulkdelete brinvacuumcleanup - brincostestimate brinoptions ));
#define BRIN_AM_OID 3580
#endif /* PG_AM_H */
......@@ -845,4 +845,168 @@ DATA(insert ( 3550 869 869 25 s 932 783 0 ));
DATA(insert ( 3550 869 869 26 s 933 783 0 ));
DATA(insert ( 3550 869 869 27 s 934 783 0 ));
/* BRIN opclasses */
/* minmax bytea */
DATA(insert ( 4064 17 17 1 s 1957 3580 0 ));
DATA(insert ( 4064 17 17 2 s 1958 3580 0 ));
DATA(insert ( 4064 17 17 3 s 1955 3580 0 ));
DATA(insert ( 4064 17 17 4 s 1960 3580 0 ));
DATA(insert ( 4064 17 17 5 s 1959 3580 0 ));
/* minmax "char" */
DATA(insert ( 4062 18 18 1 s 631 3580 0 ));
DATA(insert ( 4062 18 18 2 s 632 3580 0 ));
DATA(insert ( 4062 18 18 3 s 92 3580 0 ));
DATA(insert ( 4062 18 18 4 s 634 3580 0 ));
DATA(insert ( 4062 18 18 5 s 633 3580 0 ));
/* minmax name */
DATA(insert ( 4065 19 19 1 s 660 3580 0 ));
DATA(insert ( 4065 19 19 2 s 661 3580 0 ));
DATA(insert ( 4065 19 19 3 s 93 3580 0 ));
DATA(insert ( 4065 19 19 4 s 663 3580 0 ));
DATA(insert ( 4065 19 19 5 s 662 3580 0 ));
/* minmax bigint */
DATA(insert ( 4063 20 20 1 s 412 3580 0 ));
DATA(insert ( 4063 20 20 2 s 414 3580 0 ));
DATA(insert ( 4063 20 20 3 s 410 3580 0 ));
DATA(insert ( 4063 20 20 4 s 415 3580 0 ));
DATA(insert ( 4063 20 20 5 s 413 3580 0 ));
/* minmax smallint */
DATA(insert ( 4067 21 21 1 s 95 3580 0 ));
DATA(insert ( 4067 21 21 2 s 522 3580 0 ));
DATA(insert ( 4067 21 21 3 s 94 3580 0 ));
DATA(insert ( 4067 21 21 4 s 524 3580 0 ));
DATA(insert ( 4067 21 21 5 s 520 3580 0 ));
/* minmax integer */
DATA(insert ( 4054 23 23 1 s 97 3580 0 ));
DATA(insert ( 4054 23 23 2 s 523 3580 0 ));
DATA(insert ( 4054 23 23 3 s 96 3580 0 ));
DATA(insert ( 4054 23 23 4 s 525 3580 0 ));
DATA(insert ( 4054 23 23 5 s 521 3580 0 ));
/* minmax text */
DATA(insert ( 4056 25 25 1 s 664 3580 0 ));
DATA(insert ( 4056 25 25 2 s 665 3580 0 ));
DATA(insert ( 4056 25 25 3 s 98 3580 0 ));
DATA(insert ( 4056 25 25 4 s 667 3580 0 ));
DATA(insert ( 4056 25 25 5 s 666 3580 0 ));
/* minmax oid */
DATA(insert ( 4068 26 26 1 s 609 3580 0 ));
DATA(insert ( 4068 26 26 2 s 611 3580 0 ));
DATA(insert ( 4068 26 26 3 s 607 3580 0 ));
DATA(insert ( 4068 26 26 4 s 612 3580 0 ));
DATA(insert ( 4068 26 26 5 s 610 3580 0 ));
/* minmax tid */
DATA(insert ( 4069 27 27 1 s 2799 3580 0 ));
DATA(insert ( 4069 27 27 2 s 2801 3580 0 ));
DATA(insert ( 4069 27 27 3 s 387 3580 0 ));
DATA(insert ( 4069 27 27 4 s 2802 3580 0 ));
DATA(insert ( 4069 27 27 5 s 2800 3580 0 ));
/* minmax real */
DATA(insert ( 4070 700 700 1 s 622 3580 0 ));
DATA(insert ( 4070 700 700 2 s 624 3580 0 ));
DATA(insert ( 4070 700 700 3 s 620 3580 0 ));
DATA(insert ( 4070 700 700 4 s 625 3580 0 ));
DATA(insert ( 4070 700 700 5 s 623 3580 0 ));
/* minmax double precision */
DATA(insert ( 4071 701 701 1 s 672 3580 0 ));
DATA(insert ( 4071 701 701 2 s 673 3580 0 ));
DATA(insert ( 4071 701 701 3 s 670 3580 0 ));
DATA(insert ( 4071 701 701 4 s 675 3580 0 ));
DATA(insert ( 4071 701 701 5 s 674 3580 0 ));
/* minmax abstime */
DATA(insert ( 4072 702 702 1 s 562 3580 0 ));
DATA(insert ( 4072 702 702 2 s 564 3580 0 ));
DATA(insert ( 4072 702 702 3 s 560 3580 0 ));
DATA(insert ( 4072 702 702 4 s 565 3580 0 ));
DATA(insert ( 4072 702 702 5 s 563 3580 0 ));
/* minmax reltime */
DATA(insert ( 4073 703 703 1 s 568 3580 0 ));
DATA(insert ( 4073 703 703 2 s 570 3580 0 ));
DATA(insert ( 4073 703 703 3 s 566 3580 0 ));
DATA(insert ( 4073 703 703 4 s 571 3580 0 ));
DATA(insert ( 4073 703 703 5 s 569 3580 0 ));
/* minmax macaddr */
DATA(insert ( 4074 829 829 1 s 1222 3580 0 ));
DATA(insert ( 4074 829 829 2 s 1223 3580 0 ));
DATA(insert ( 4074 829 829 3 s 1220 3580 0 ));
DATA(insert ( 4074 829 829 4 s 1225 3580 0 ));
DATA(insert ( 4074 829 829 5 s 1224 3580 0 ));
/* minmax inet */
DATA(insert ( 4075 869 869 1 s 1203 3580 0 ));
DATA(insert ( 4075 869 869 2 s 1204 3580 0 ));
DATA(insert ( 4075 869 869 3 s 1201 3580 0 ));
DATA(insert ( 4075 869 869 4 s 1206 3580 0 ));
DATA(insert ( 4075 869 869 5 s 1205 3580 0 ));
/* minmax character */
DATA(insert ( 4076 1042 1042 1 s 1058 3580 0 ));
DATA(insert ( 4076 1042 1042 2 s 1059 3580 0 ));
DATA(insert ( 4076 1042 1042 3 s 1054 3580 0 ));
DATA(insert ( 4076 1042 1042 4 s 1061 3580 0 ));
DATA(insert ( 4076 1042 1042 5 s 1060 3580 0 ));
/* minmax date */
DATA(insert ( 4061 1082 1082 1 s 1095 3580 0 ));
DATA(insert ( 4061 1082 1082 2 s 1096 3580 0 ));
DATA(insert ( 4061 1082 1082 3 s 1093 3580 0 ));
DATA(insert ( 4061 1082 1082 4 s 1098 3580 0 ));
DATA(insert ( 4061 1082 1082 5 s 1097 3580 0 ));
/* minmax time without time zone */
DATA(insert ( 4077 1083 1083 1 s 1110 3580 0 ));
DATA(insert ( 4077 1083 1083 2 s 1111 3580 0 ));
DATA(insert ( 4077 1083 1083 3 s 1108 3580 0 ));
DATA(insert ( 4077 1083 1083 4 s 1113 3580 0 ));
DATA(insert ( 4077 1083 1083 5 s 1112 3580 0 ));
/* minmax timestamp without time zone */
DATA(insert ( 4059 1114 1114 1 s 2062 3580 0 ));
DATA(insert ( 4059 1114 1114 2 s 2063 3580 0 ));
DATA(insert ( 4059 1114 1114 3 s 2060 3580 0 ));
DATA(insert ( 4059 1114 1114 4 s 2065 3580 0 ));
DATA(insert ( 4059 1114 1114 5 s 2064 3580 0 ));
/* minmax timestamp with time zone */
DATA(insert ( 4060 1184 1184 1 s 1322 3580 0 ));
DATA(insert ( 4060 1184 1184 2 s 1323 3580 0 ));
DATA(insert ( 4060 1184 1184 3 s 1320 3580 0 ));
DATA(insert ( 4060 1184 1184 4 s 1325 3580 0 ));
DATA(insert ( 4060 1184 1184 5 s 1324 3580 0 ));
/* minmax interval */
DATA(insert ( 4078 1186 1186 1 s 1332 3580 0 ));
DATA(insert ( 4078 1186 1186 2 s 1333 3580 0 ));
DATA(insert ( 4078 1186 1186 3 s 1330 3580 0 ));
DATA(insert ( 4078 1186 1186 4 s 1335 3580 0 ));
DATA(insert ( 4078 1186 1186 5 s 1334 3580 0 ));
/* minmax time with time zone */
DATA(insert ( 4058 1266 1266 1 s 1552 3580 0 ));
DATA(insert ( 4058 1266 1266 2 s 1553 3580 0 ));
DATA(insert ( 4058 1266 1266 3 s 1550 3580 0 ));
DATA(insert ( 4058 1266 1266 4 s 1555 3580 0 ));
DATA(insert ( 4058 1266 1266 5 s 1554 3580 0 ));
/* minmax bit */
DATA(insert ( 4079 1560 1560 1 s 1786 3580 0 ));
DATA(insert ( 4079 1560 1560 2 s 1788 3580 0 ));
DATA(insert ( 4079 1560 1560 3 s 1784 3580 0 ));
DATA(insert ( 4079 1560 1560 4 s 1789 3580 0 ));
DATA(insert ( 4079 1560 1560 5 s 1787 3580 0 ));
/* minmax bit varying */
DATA(insert ( 4080 1562 1562 1 s 1806 3580 0 ));
DATA(insert ( 4080 1562 1562 2 s 1808 3580 0 ));
DATA(insert ( 4080 1562 1562 3 s 1804 3580 0 ));
DATA(insert ( 4080 1562 1562 4 s 1809 3580 0 ));
DATA(insert ( 4080 1562 1562 5 s 1807 3580 0 ));
/* minmax numeric */
DATA(insert ( 4055 1700 1700 1 s 1754 3580 0 ));
DATA(insert ( 4055 1700 1700 2 s 1755 3580 0 ));
DATA(insert ( 4055 1700 1700 3 s 1752 3580 0 ));
DATA(insert ( 4055 1700 1700 4 s 1757 3580 0 ));
DATA(insert ( 4055 1700 1700 5 s 1756 3580 0 ));
/* minmax uuid */
DATA(insert ( 4081 2950 2950 1 s 2974 3580 0 ));
DATA(insert ( 4081 2950 2950 2 s 2976 3580 0 ));
DATA(insert ( 4081 2950 2950 3 s 2972 3580 0 ));
DATA(insert ( 4081 2950 2950 4 s 2977 3580 0 ));
DATA(insert ( 4081 2950 2950 5 s 2975 3580 0 ));
/* minmax pg_lsn */
DATA(insert ( 4082 3220 3220 1 s 3224 3580 0 ));
DATA(insert ( 4082 3220 3220 2 s 3226 3580 0 ));
DATA(insert ( 4082 3220 3220 3 s 3222 3580 0 ));
DATA(insert ( 4082 3220 3220 4 s 3227 3580 0 ));
DATA(insert ( 4082 3220 3220 5 s 3225 3580 0 ));
#endif /* PG_AMOP_H */
......@@ -432,4 +432,249 @@ DATA(insert ( 4017 25 25 3 4029 ));
DATA(insert ( 4017 25 25 4 4030 ));
DATA(insert ( 4017 25 25 5 4031 ));
/* BRIN opclasses */
/* minmax bytea */
DATA(insert ( 4064 17 17 1 3383 ));
DATA(insert ( 4064 17 17 2 3384 ));
DATA(insert ( 4064 17 17 3 3385 ));
DATA(insert ( 4064 17 17 4 3386 ));
DATA(insert ( 4064 17 17 11 1949 ));
DATA(insert ( 4064 17 17 12 1950 ));
DATA(insert ( 4064 17 17 13 1952 ));
DATA(insert ( 4064 17 17 14 1951 ));
/* minmax "char" */
DATA(insert ( 4062 18 18 1 3383 ));
DATA(insert ( 4062 18 18 2 3384 ));
DATA(insert ( 4062 18 18 3 3385 ));
DATA(insert ( 4062 18 18 4 3386 ));
DATA(insert ( 4062 18 18 11 1246 ));
DATA(insert ( 4062 18 18 12 72 ));
DATA(insert ( 4062 18 18 13 74 ));
DATA(insert ( 4062 18 18 14 73 ));
/* minmax name */
DATA(insert ( 4065 19 19 1 3383 ));
DATA(insert ( 4065 19 19 2 3384 ));
DATA(insert ( 4065 19 19 3 3385 ));
DATA(insert ( 4065 19 19 4 3386 ));
DATA(insert ( 4065 19 19 11 655 ));
DATA(insert ( 4065 19 19 12 656 ));
DATA(insert ( 4065 19 19 13 658 ));
DATA(insert ( 4065 19 19 14 657 ));
/* minmax bigint */
DATA(insert ( 4063 20 20 1 3383 ));
DATA(insert ( 4063 20 20 2 3384 ));
DATA(insert ( 4063 20 20 3 3385 ));
DATA(insert ( 4063 20 20 4 3386 ));
DATA(insert ( 4063 20 20 11 469 ));
DATA(insert ( 4063 20 20 12 471 ));
DATA(insert ( 4063 20 20 13 472 ));
DATA(insert ( 4063 20 20 14 470 ));
/* minmax smallint */
DATA(insert ( 4067 21 21 1 3383 ));
DATA(insert ( 4067 21 21 2 3384 ));
DATA(insert ( 4067 21 21 3 3385 ));
DATA(insert ( 4067 21 21 4 3386 ));
DATA(insert ( 4067 21 21 11 64 ));
DATA(insert ( 4067 21 21 12 148 ));
DATA(insert ( 4067 21 21 13 151 ));
DATA(insert ( 4067 21 21 14 146 ));
/* minmax integer */
DATA(insert ( 4054 23 23 1 3383 ));
DATA(insert ( 4054 23 23 2 3384 ));
DATA(insert ( 4054 23 23 3 3385 ));
DATA(insert ( 4054 23 23 4 3386 ));
DATA(insert ( 4054 23 23 11 66 ));
DATA(insert ( 4054 23 23 12 149 ));
DATA(insert ( 4054 23 23 13 150 ));
DATA(insert ( 4054 23 23 14 147 ));
/* minmax text */
DATA(insert ( 4056 25 25 1 3383 ));
DATA(insert ( 4056 25 25 2 3384 ));
DATA(insert ( 4056 25 25 3 3385 ));
DATA(insert ( 4056 25 25 4 3386 ));
DATA(insert ( 4056 25 25 11 740 ));
DATA(insert ( 4056 25 25 12 741 ));
DATA(insert ( 4056 25 25 13 743 ));
DATA(insert ( 4056 25 25 14 742 ));
/* minmax oid */
DATA(insert ( 4068 26 26 1 3383 ));
DATA(insert ( 4068 26 26 2 3384 ));
DATA(insert ( 4068 26 26 3 3385 ));
DATA(insert ( 4068 26 26 4 3386 ));
DATA(insert ( 4068 26 26 11 716 ));
DATA(insert ( 4068 26 26 12 717 ));
DATA(insert ( 4068 26 26 13 1639 ));
DATA(insert ( 4068 26 26 14 1638 ));
/* minmax tid */
DATA(insert ( 4069 27 27 1 3383 ));
DATA(insert ( 4069 27 27 2 3384 ));
DATA(insert ( 4069 27 27 3 3385 ));
DATA(insert ( 4069 27 27 4 3386 ));
DATA(insert ( 4069 27 27 11 2791 ));
DATA(insert ( 4069 27 27 12 2793 ));
DATA(insert ( 4069 27 27 13 2792 ));
DATA(insert ( 4069 27 27 14 2790 ));
/* minmax real */
DATA(insert ( 4070 700 700 1 3383 ));
DATA(insert ( 4070 700 700 2 3384 ));
DATA(insert ( 4070 700 700 3 3385 ));
DATA(insert ( 4070 700 700 4 3386 ));
DATA(insert ( 4070 700 700 11 289 ));
DATA(insert ( 4070 700 700 12 290 ));
DATA(insert ( 4070 700 700 13 292 ));
DATA(insert ( 4070 700 700 14 291 ));
/* minmax double precision */
DATA(insert ( 4071 701 701 1 3383 ));
DATA(insert ( 4071 701 701 2 3384 ));
DATA(insert ( 4071 701 701 3 3385 ));
DATA(insert ( 4071 701 701 4 3386 ));
DATA(insert ( 4071 701 701 11 295 ));
DATA(insert ( 4071 701 701 12 296 ));
DATA(insert ( 4071 701 701 13 298 ));
DATA(insert ( 4071 701 701 14 297 ));
/* minmax abstime */
DATA(insert ( 4072 702 702 1 3383 ));
DATA(insert ( 4072 702 702 2 3384 ));
DATA(insert ( 4072 702 702 3 3385 ));
DATA(insert ( 4072 702 702 4 3386 ));
DATA(insert ( 4072 702 702 11 253 ));
DATA(insert ( 4072 702 702 12 255 ));
DATA(insert ( 4072 702 702 13 256 ));
DATA(insert ( 4072 702 702 14 254 ));
/* minmax reltime */
DATA(insert ( 4073 703 703 1 3383 ));
DATA(insert ( 4073 703 703 2 3384 ));
DATA(insert ( 4073 703 703 3 3385 ));
DATA(insert ( 4073 703 703 4 3386 ));
DATA(insert ( 4073 703 703 11 259 ));
DATA(insert ( 4073 703 703 12 261 ));
DATA(insert ( 4073 703 703 13 262 ));
DATA(insert ( 4073 703 703 14 260 ));
/* minmax macaddr */
DATA(insert ( 4074 829 829 1 3383 ));
DATA(insert ( 4074 829 829 2 3384 ));
DATA(insert ( 4074 829 829 3 3385 ));
DATA(insert ( 4074 829 829 4 3386 ));
DATA(insert ( 4074 829 829 11 831 ));
DATA(insert ( 4074 829 829 12 832 ));
DATA(insert ( 4074 829 829 13 834 ));
DATA(insert ( 4074 829 829 14 833 ));
/* minmax inet */
DATA(insert ( 4075 869 869 1 3383 ));
DATA(insert ( 4075 869 869 2 3384 ));
DATA(insert ( 4075 869 869 3 3385 ));
DATA(insert ( 4075 869 869 4 3386 ));
DATA(insert ( 4075 869 869 11 921 ));
DATA(insert ( 4075 869 869 12 922 ));
DATA(insert ( 4075 869 869 13 924 ));
DATA(insert ( 4075 869 869 14 923 ));
/* minmax character */
DATA(insert ( 4076 1042 1042 1 3383 ));
DATA(insert ( 4076 1042 1042 2 3384 ));
DATA(insert ( 4076 1042 1042 3 3385 ));
DATA(insert ( 4076 1042 1042 4 3386 ));
DATA(insert ( 4076 1042 1042 11 1049 ));
DATA(insert ( 4076 1042 1042 12 1050 ));
DATA(insert ( 4076 1042 1042 13 1052 ));
DATA(insert ( 4076 1042 1042 14 1051 ));
/* minmax date */
DATA(insert ( 4061 1082 1082 1 3383 ));
DATA(insert ( 4061 1082 1082 2 3384 ));
DATA(insert ( 4061 1082 1082 3 3385 ));
DATA(insert ( 4061 1082 1082 4 3386 ));
DATA(insert ( 4061 1082 1082 11 1087 ));
DATA(insert ( 4061 1082 1082 12 1088 ));
DATA(insert ( 4061 1082 1082 13 1090 ));
DATA(insert ( 4061 1082 1082 14 1089 ));
/* minmax time without time zone */
DATA(insert ( 4077 1083 1083 1 3383 ));
DATA(insert ( 4077 1083 1083 2 3384 ));
DATA(insert ( 4077 1083 1083 3 3385 ));
DATA(insert ( 4077 1083 1083 4 3386 ));
DATA(insert ( 4077 1083 1083 11 1102 ));
DATA(insert ( 4077 1083 1083 12 1103 ));
DATA(insert ( 4077 1083 1083 13 1105 ));
DATA(insert ( 4077 1083 1083 14 1104 ));
/* minmax timestamp without time zone */
DATA(insert ( 4059 1114 1114 1 3383 ));
DATA(insert ( 4059 1114 1114 2 3384 ));
DATA(insert ( 4059 1114 1114 3 3385 ));
DATA(insert ( 4059 1114 1114 4 3386 ));
DATA(insert ( 4059 1114 1114 11 2054 ));
DATA(insert ( 4059 1114 1114 12 2055 ));
DATA(insert ( 4059 1114 1114 13 2056 ));
DATA(insert ( 4059 1114 1114 14 2057 ));
/* minmax timestamp with time zone */
DATA(insert ( 4060 1184 1184 1 3383 ));
DATA(insert ( 4060 1184 1184 2 3384 ));
DATA(insert ( 4060 1184 1184 3 3385 ));
DATA(insert ( 4060 1184 1184 4 3386 ));
DATA(insert ( 4060 1184 1184 11 1154 ));
DATA(insert ( 4060 1184 1184 12 1155 ));
DATA(insert ( 4060 1184 1184 13 1156 ));
DATA(insert ( 4060 1184 1184 14 1157 ));
/* minmax interval */
DATA(insert ( 4078 1186 1186 1 3383 ));
DATA(insert ( 4078 1186 1186 2 3384 ));
DATA(insert ( 4078 1186 1186 3 3385 ));
DATA(insert ( 4078 1186 1186 4 3386 ));
DATA(insert ( 4078 1186 1186 11 1164 ));
DATA(insert ( 4078 1186 1186 12 1165 ));
DATA(insert ( 4078 1186 1186 13 1166 ));
DATA(insert ( 4078 1186 1186 14 1167 ));
/* minmax time with time zone */
DATA(insert ( 4058 1266 1266 1 3383 ));
DATA(insert ( 4058 1266 1266 2 3384 ));
DATA(insert ( 4058 1266 1266 3 3385 ));
DATA(insert ( 4058 1266 1266 4 3386 ));
DATA(insert ( 4058 1266 1266 11 1354 ));
DATA(insert ( 4058 1266 1266 12 1355 ));
DATA(insert ( 4058 1266 1266 13 1356 ));
DATA(insert ( 4058 1266 1266 14 1357 ));
/* minmax bit */
DATA(insert ( 4079 1560 1560 1 3383 ));
DATA(insert ( 4079 1560 1560 2 3384 ));
DATA(insert ( 4079 1560 1560 3 3385 ));
DATA(insert ( 4079 1560 1560 4 3386 ));
DATA(insert ( 4079 1560 1560 11 1595 ));
DATA(insert ( 4079 1560 1560 12 1594 ));
DATA(insert ( 4079 1560 1560 13 1592 ));
DATA(insert ( 4079 1560 1560 14 1593 ));
/* minmax bit varying */
DATA(insert ( 4080 1562 1562 1 3383 ));
DATA(insert ( 4080 1562 1562 2 3384 ));
DATA(insert ( 4080 1562 1562 3 3385 ));
DATA(insert ( 4080 1562 1562 4 3386 ));
DATA(insert ( 4080 1562 1562 11 1671 ));
DATA(insert ( 4080 1562 1562 12 1670 ));
DATA(insert ( 4080 1562 1562 13 1668 ));
DATA(insert ( 4080 1562 1562 14 1669 ));
/* minmax numeric */
DATA(insert ( 4055 1700 1700 1 3383 ));
DATA(insert ( 4055 1700 1700 2 3384 ));
DATA(insert ( 4055 1700 1700 3 3385 ));
DATA(insert ( 4055 1700 1700 4 3386 ));
DATA(insert ( 4055 1700 1700 11 1722 ));
DATA(insert ( 4055 1700 1700 12 1723 ));
DATA(insert ( 4055 1700 1700 13 1721 ));
DATA(insert ( 4055 1700 1700 14 1720 ));
/* minmax uuid */
DATA(insert ( 4081 2950 2950 1 3383 ));
DATA(insert ( 4081 2950 2950 2 3384 ));
DATA(insert ( 4081 2950 2950 3 3385 ));
DATA(insert ( 4081 2950 2950 4 3386 ));
DATA(insert ( 4081 2950 2950 11 2954 ));
DATA(insert ( 4081 2950 2950 12 2955 ));
DATA(insert ( 4081 2950 2950 13 2957 ));
DATA(insert ( 4081 2950 2950 14 2958 ));
/* minmax pg_lsn */
DATA(insert ( 4082 3220 3220 1 3383 ));
DATA(insert ( 4082 3220 3220 2 3384 ));
DATA(insert ( 4082 3220 3220 3 3385 ));
DATA(insert ( 4082 3220 3220 4 3386 ));
DATA(insert ( 4082 3220 3220 11 3231 ));
DATA(insert ( 4082 3220 3220 12 3232 ));
DATA(insert ( 4082 3220 3220 13 3234 ));
DATA(insert ( 4082 3220 3220 14 3235 ));
#endif /* PG_AMPROC_H */
......@@ -236,4 +236,36 @@ DATA(insert ( 405 jsonb_ops PGNSP PGUID 4034 3802 t 0 ));
DATA(insert ( 2742 jsonb_ops PGNSP PGUID 4036 3802 t 25 ));
DATA(insert ( 2742 jsonb_path_ops PGNSP PGUID 4037 3802 f 23 ));
/* BRIN operator classes */
/* no brin opclass for bool */
DATA(insert ( 3580 bytea_minmax_ops PGNSP PGUID 4064 17 t 0 ));
DATA(insert ( 3580 char_minmax_ops PGNSP PGUID 4062 18 t 0 ));
DATA(insert ( 3580 name_minmax_ops PGNSP PGUID 4065 19 t 0 ));
DATA(insert ( 3580 int8_minmax_ops PGNSP PGUID 4063 20 t 0 ));
DATA(insert ( 3580 int2_minmax_ops PGNSP PGUID 4067 21 t 0 ));
DATA(insert ( 3580 int4_minmax_ops PGNSP PGUID 4054 23 t 0 ));
DATA(insert ( 3580 text_minmax_ops PGNSP PGUID 4056 25 t 0 ));
DATA(insert ( 3580 oid_minmax_ops PGNSP PGUID 4068 26 t 0 ));
DATA(insert ( 3580 tid_minmax_ops PGNSP PGUID 4069 27 t 0 ));
DATA(insert ( 3580 float4_minmax_ops PGNSP PGUID 4070 700 t 0 ));
DATA(insert ( 3580 float8_minmax_ops PGNSP PGUID 4071 701 t 0 ));
DATA(insert ( 3580 abstime_minmax_ops PGNSP PGUID 4072 702 t 0 ));
DATA(insert ( 3580 reltime_minmax_ops PGNSP PGUID 4073 703 t 0 ));
DATA(insert ( 3580 macaddr_minmax_ops PGNSP PGUID 4074 829 t 0 ));
DATA(insert ( 3580 inet_minmax_ops PGNSP PGUID 4075 869 t 0 ));
DATA(insert ( 3580 bpchar_minmax_ops PGNSP PGUID 4076 1042 t 0 ));
DATA(insert ( 3580 date_minmax_ops PGNSP PGUID 4061 1082 t 0 ));
DATA(insert ( 3580 time_minmax_ops PGNSP PGUID 4077 1083 t 0 ));
DATA(insert ( 3580 timestamp_minmax_ops PGNSP PGUID 4059 1114 t 0 ));
DATA(insert ( 3580 timestamptz_minmax_ops PGNSP PGUID 4060 1184 t 0 ));
DATA(insert ( 3580 interval_minmax_ops PGNSP PGUID 4078 1186 t 0 ));
DATA(insert ( 3580 timetz_minmax_ops PGNSP PGUID 4058 1266 t 0 ));
DATA(insert ( 3580 bit_minmax_ops PGNSP PGUID 4079 1560 t 0 ));
DATA(insert ( 3580 varbit_minmax_ops PGNSP PGUID 4080 1562 t 0 ));
DATA(insert ( 3580 numeric_minmax_ops PGNSP PGUID 4055 1700 t 0 ));
/* no brin opclass for record, anyarray */
DATA(insert ( 3580 uuid_minmax_ops PGNSP PGUID 4081 2950 t 0 ));
DATA(insert ( 3580 pg_lsn_minmax_ops PGNSP PGUID 4082 3220 t 0 ));
/* no brin opclass for enum, tsvector, tsquery, jsonb, range */
#endif /* PG_OPCLASS_H */
......@@ -157,4 +157,32 @@ DATA(insert OID = 4035 ( 783 jsonb_ops PGNSP PGUID ));
DATA(insert OID = 4036 ( 2742 jsonb_ops PGNSP PGUID ));
DATA(insert OID = 4037 ( 2742 jsonb_path_ops PGNSP PGUID ));
DATA(insert OID = 4054 ( 3580 int4_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4055 ( 3580 numeric_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4056 ( 3580 text_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4058 ( 3580 timetz_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4059 ( 3580 timestamp_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4060 ( 3580 timestamptz_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4061 ( 3580 date_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4062 ( 3580 char_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4063 ( 3580 int8_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4064 ( 3580 bytea_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4065 ( 3580 name_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4067 ( 3580 int2_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4068 ( 3580 oid_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4069 ( 3580 tid_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4070 ( 3580 float4_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4071 ( 3580 float8_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4072 ( 3580 abstime_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4073 ( 3580 reltime_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4074 ( 3580 macaddr_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4075 ( 3580 inet_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4076 ( 3580 bpchar_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4077 ( 3580 time_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4078 ( 3580 interval_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4079 ( 3580 bit_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4080 ( 3580 varbit_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4081 ( 3580 uuid_minmax_ops PGNSP PGUID ));
DATA(insert OID = 4082 ( 3580 pg_lsn_minmax_ops PGNSP PGUID ));
#endif /* PG_OPFAMILY_H */
......@@ -565,6 +565,35 @@ DESCR("btree(internal)");
DATA(insert OID = 2785 ( btoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ btoptions _null_ _null_ _null_ ));
DESCR("btree(internal)");
DATA(insert OID = 3789 ( bringetbitmap PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 20 "2281 2281" _null_ _null_ _null_ _null_ bringetbitmap _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3790 ( brininsert PGNSP PGUID 12 1 0 0 0 f f f f t f v 6 0 16 "2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brininsert _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3791 ( brinbeginscan PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ brinbeginscan _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3792 ( brinrescan PGNSP PGUID 12 1 0 0 0 f f f f t f v 5 0 2278 "2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brinrescan _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3793 ( brinendscan PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinendscan _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3794 ( brinmarkpos PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinmarkpos _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3795 ( brinrestrpos PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinrestrpos _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3796 ( brinbuild PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ brinbuild _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3797 ( brinbuildempty PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ brinbuildempty _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3798 ( brinbulkdelete PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 2281 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ brinbulkdelete _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3799 ( brinvacuumcleanup PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 2281 "2281 2281" _null_ _null_ _null_ _null_ brinvacuumcleanup _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3800 ( brincostestimate PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ brincostestimate _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3801 ( brinoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ brinoptions _null_ _null_ _null_ ));
DESCR("brin(internal)");
DATA(insert OID = 3952 ( brin_summarize_new_values PGNSP PGUID 12 1 0 0 0 f f f f f f v 1 0 23 "2205" _null_ _null_ _null_ _null_ brin_summarize_new_values _null_ _null_ _null_ ));
DESCR("brin: standalone scan new table pages");
DATA(insert OID = 339 ( poly_same PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_same _null_ _null_ _null_ ));
DATA(insert OID = 340 ( poly_contain PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_contain _null_ _null_ _null_ ));
DATA(insert OID = 341 ( poly_left PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_left _null_ _null_ _null_ ));
......@@ -4078,6 +4107,16 @@ DATA(insert OID = 2747 ( arrayoverlap PGNSP PGUID 12 1 0 0 0 f f f f t f i
DATA(insert OID = 2748 ( arraycontains PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "2277 2277" _null_ _null_ _null_ _null_ arraycontains _null_ _null_ _null_ ));
DATA(insert OID = 2749 ( arraycontained PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "2277 2277" _null_ _null_ _null_ _null_ arraycontained _null_ _null_ _null_ ));
/* BRIN minmax */
DATA(insert OID = 3383 ( brin_minmax_opcinfo PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2281 "2281" _null_ _null_ _null_ _null_ minmaxOpcInfo _null_ _null_ _null_ ));
DESCR("BRIN minmax support");
DATA(insert OID = 3384 ( brin_minmax_add_value PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 16 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ minmaxAddValue _null_ _null_ _null_ ));
DESCR("BRIN minmax support");
DATA(insert OID = 3385 ( brin_minmax_consistent PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ minmaxConsistent _null_ _null_ _null_ ));
DESCR("BRIN minmax support");
DATA(insert OID = 3386 ( brin_minmax_union PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ minmaxUnion _null_ _null_ _null_ ));
DESCR("BRIN minmax support");
/* userlock replacements */
DATA(insert OID = 2880 ( pg_advisory_lock PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "20" _null_ _null_ _null_ _null_ pg_advisory_lock_int8 _null_ _null_ _null_ ));
DESCR("obtain exclusive advisory lock");
......
......@@ -403,6 +403,8 @@ extern Size PageGetExactFreeSpace(Page page);
extern Size PageGetHeapFreeSpace(Page page);
extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
extern void PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos,
int nitems);
extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
......
......@@ -190,6 +190,7 @@ extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
extern Selectivity estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey,
double nbuckets);
extern Datum brincostestimate(PG_FUNCTION_ARGS);
extern Datum btcostestimate(PG_FUNCTION_ARGS);
extern Datum hashcostestimate(PG_FUNCTION_ARGS);
extern Datum gistcostestimate(PG_FUNCTION_ARGS);
......
SET synchronous_commit = 0;
CREATE TABLE brintest (byteacol bytea,
charcol "char",
namecol name,
int8col bigint,
int2col smallint,
int4col integer,
textcol text,
oidcol oid,
tidcol tid,
float4col real,
float8col double precision,
macaddrcol macaddr,
inetcol inet,
bpcharcol character,
datecol date,
timecol time without time zone,
timestampcol timestamp without time zone,
timestamptzcol timestamp with time zone,
intervalcol interval,
timetzcol time with time zone,
bitcol bit(10),
varbitcol bit varying(16),
numericcol numeric,
uuidcol uuid,
lsncol pg_lsn
) WITH (fillfactor=50);
INSERT INTO brintest SELECT
repeat(stringu1, 42)::bytea,
substr(stringu1, 1, 1)::"char",
stringu1::name, 142857 * tenthous,
thousand,
twothousand,
repeat(stringu1, 42),
unique1::oid,
format('(%s,%s)', tenthous, twenty)::tid,
(four + 1.0)/(hundred+1),
odd::float8 / (tenthous + 1),
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
inet '10.2.3.4' + tenthous,
substr(stringu1, 1, 1)::bpchar,
date '1995-08-15' + tenthous,
time '01:20:30' + thousand * interval '18.5 second',
timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
justify_days(justify_hours(tenthous * interval '12 minutes')),
timetz '01:30:20' + hundred * interval '15 seconds',
thousand::bit(10),
tenthous::bit(16)::varbit,
tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
format('%s/%s%s', odd, even, tenthous)::pg_lsn
FROM tenk1;
CREATE INDEX brinidx ON brintest USING brin (
byteacol,
charcol,
namecol,
int8col,
int2col,
int4col,
textcol,
oidcol,
tidcol,
float4col,
float8col,
macaddrcol,
inetcol,
bpcharcol,
datecol,
timecol,
timestampcol,
timestamptzcol,
intervalcol,
timetzcol,
bitcol,
varbitcol,
numericcol,
uuidcol,
lsncol
) with (pages_per_range = 1);
CREATE TABLE brinopers (colname name, op text[], value text[],
check (cardinality(op) = cardinality(value)));
INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}');
INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}');
INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}');
INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}');
INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}');
INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}');
INSERT INTO brinopers VALUES ('float4col', '{>, >=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}');
INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}');
INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}');
INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}');
INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}');
INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}');
INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}');
INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}');
INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}');
INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}');
INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}');
INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}');
INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}');
INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}');
INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}');
DO $x$
DECLARE
r record;
tabname text;
tabname_ss text;
count int;
query text;
plan text;
BEGIN
FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP
tabname := format('qry_%s', r.row_number);
tabname_ss := tabname || '_ss';
query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
tabname, r.colname, r.oper, r.value);
-- run the query using the brin index
SET enable_seqscan = 0;
SET enable_bitmapscan = 1;
EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname);
EXECUTE query;
-- run the query using a seqscan
SET enable_seqscan = 1;
SET enable_bitmapscan = 0;
query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
tabname_ss, r.colname, r.oper, r.value);
EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss);
EXECUTE query;
-- make sure both return the same results
EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss);
GET DIAGNOSTICS count = ROW_COUNT;
IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname);
GET DIAGNOSTICS count = ROW_COUNT;
IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
end loop;
end;
$x$;
INSERT INTO brintest SELECT
repeat(stringu1, 42)::bytea,
substr(stringu1, 1, 1)::"char",
stringu1::name, 142857 * tenthous,
thousand,
twothousand,
repeat(stringu1, 42),
unique1::oid,
format('(%s,%s)', tenthous, twenty)::tid,
(four + 1.0)/(hundred+1),
odd::float8 / (tenthous + 1),
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
inet '10.2.3.4' + tenthous,
substr(stringu1, 1, 1)::bpchar,
date '1995-08-15' + tenthous,
time '01:20:30' + thousand * interval '18.5 second',
timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
justify_days(justify_hours(tenthous * interval '12 minutes')),
timetz '01:30:20' + hundred * interval '15 seconds',
thousand::bit(10),
tenthous::bit(16)::varbit,
tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
format('%s/%s%s', odd, even, tenthous)::pg_lsn
FROM tenk1;
SELECT brin_summarize_new_values('brinidx'::regclass);
brin_summarize_new_values
---------------------------
2000
(1 row)
UPDATE brintest SET int8col = int8col * int4col;
SET synchronous_commit = 1;
......@@ -1658,6 +1658,11 @@ ORDER BY 1, 2, 3;
2742 | 9 | ?
2742 | 10 | ?|
2742 | 11 | ?&
3580 | 1 | <
3580 | 2 | <=
3580 | 3 | =
3580 | 4 | >=
3580 | 5 | >
4000 | 1 | <<
4000 | 1 | ~<~
4000 | 2 | &<
......@@ -1680,7 +1685,7 @@ ORDER BY 1, 2, 3;
4000 | 15 | >
4000 | 16 | @>
4000 | 18 | =
(80 rows)
(85 rows)
-- Check that all opclass search operators have selectivity estimators.
-- This is not absolutely required, but it seems a reasonable thing
......@@ -1842,11 +1847,13 @@ WHERE NOT (
-- GIN has six support functions. 1-3 are mandatory, 5 is optional, and
-- at least one of 4 and 6 must be given.
-- SP-GiST has five support functions, all mandatory
-- BRIN has four mandatory support functions, and a bunch of optionals
amname = 'btree' AND procnums @> '{1}' OR
amname = 'hash' AND procnums = '{1}' OR
amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
);
amname | opfname | amproclefttype | amprocrighttype | procnums
--------+---------+----------------+-----------------+----------
......@@ -1867,7 +1874,8 @@ WHERE NOT (
amname = 'hash' AND procnums = '{1}' OR
amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
);
amname | opcname | procnums
--------+---------+----------
......
......@@ -591,6 +591,8 @@ SELECT user_relns() AS user_relns
bb
box_tbl
bprime
brinopers
brintest
bt_f8_heap
bt_i4_heap
bt_name_heap
......@@ -698,7 +700,7 @@ SELECT user_relns() AS user_relns
tvvmv
varchar_tbl
xacttest
(120 rows)
(122 rows)
SELECT name(equipment(hobby_construct(text 'skywalking', text 'mer')));
name
......
......@@ -83,7 +83,7 @@ test: select_into select_distinct select_distinct_on select_implicit select_havi
# ----------
# Another group of parallel tests
# ----------
test: privileges security_label collate matview lock replica_identity rowsecurity
test: brin privileges security_label collate matview lock replica_identity rowsecurity
# ----------
# Another group of parallel tests
......
......@@ -106,6 +106,7 @@ test: alter_generic
test: misc
test: psql
test: async
test: brin
test: rules
test: event_trigger
test: select_views
......
SET synchronous_commit = 0;
CREATE TABLE brintest (byteacol bytea,
charcol "char",
namecol name,
int8col bigint,
int2col smallint,
int4col integer,
textcol text,
oidcol oid,
tidcol tid,
float4col real,
float8col double precision,
macaddrcol macaddr,
inetcol inet,
bpcharcol character,
datecol date,
timecol time without time zone,
timestampcol timestamp without time zone,
timestamptzcol timestamp with time zone,
intervalcol interval,
timetzcol time with time zone,
bitcol bit(10),
varbitcol bit varying(16),
numericcol numeric,
uuidcol uuid,
lsncol pg_lsn
) WITH (fillfactor=50);
INSERT INTO brintest SELECT
repeat(stringu1, 42)::bytea,
substr(stringu1, 1, 1)::"char",
stringu1::name, 142857 * tenthous,
thousand,
twothousand,
repeat(stringu1, 42),
unique1::oid,
format('(%s,%s)', tenthous, twenty)::tid,
(four + 1.0)/(hundred+1),
odd::float8 / (tenthous + 1),
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
inet '10.2.3.4' + tenthous,
substr(stringu1, 1, 1)::bpchar,
date '1995-08-15' + tenthous,
time '01:20:30' + thousand * interval '18.5 second',
timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
justify_days(justify_hours(tenthous * interval '12 minutes')),
timetz '01:30:20' + hundred * interval '15 seconds',
thousand::bit(10),
tenthous::bit(16)::varbit,
tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
format('%s/%s%s', odd, even, tenthous)::pg_lsn
FROM tenk1;
CREATE INDEX brinidx ON brintest USING brin (
byteacol,
charcol,
namecol,
int8col,
int2col,
int4col,
textcol,
oidcol,
tidcol,
float4col,
float8col,
macaddrcol,
inetcol,
bpcharcol,
datecol,
timecol,
timestampcol,
timestamptzcol,
intervalcol,
timetzcol,
bitcol,
varbitcol,
numericcol,
uuidcol,
lsncol
) with (pages_per_range = 1);
CREATE TABLE brinopers (colname name, op text[], value text[],
check (cardinality(op) = cardinality(value)));
INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}');
INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}');
INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}');
INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}');
INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}');
INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}');
INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}');
INSERT INTO brinopers VALUES ('float4col', '{>, >=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}');
INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}');
INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}');
INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}');
INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}');
INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}');
INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}');
INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}');
INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}');
INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}');
INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}');
INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}');
INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}');
INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}');
INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}');
INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}');
DO $x$
DECLARE
r record;
tabname text;
tabname_ss text;
count int;
query text;
plan text;
BEGIN
FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP
tabname := format('qry_%s', r.row_number);
tabname_ss := tabname || '_ss';
query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
tabname, r.colname, r.oper, r.value);
-- run the query using the brin index
SET enable_seqscan = 0;
SET enable_bitmapscan = 1;
EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname);
EXECUTE query;
-- run the query using a seqscan
SET enable_seqscan = 1;
SET enable_bitmapscan = 0;
query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$,
tabname_ss, r.colname, r.oper, r.value);
EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss);
EXECUTE query;
-- make sure both return the same results
EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss);
GET DIAGNOSTICS count = ROW_COUNT;
IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname);
GET DIAGNOSTICS count = ROW_COUNT;
IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF;
end loop;
end;
$x$;
INSERT INTO brintest SELECT
repeat(stringu1, 42)::bytea,
substr(stringu1, 1, 1)::"char",
stringu1::name, 142857 * tenthous,
thousand,
twothousand,
repeat(stringu1, 42),
unique1::oid,
format('(%s,%s)', tenthous, twenty)::tid,
(four + 1.0)/(hundred+1),
odd::float8 / (tenthous + 1),
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr,
inet '10.2.3.4' + tenthous,
substr(stringu1, 1, 1)::bpchar,
date '1995-08-15' + tenthous,
time '01:20:30' + thousand * interval '18.5 second',
timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours',
timestamptz '1972-10-10 03:00' + thousand * interval '1 hour',
justify_days(justify_hours(tenthous * interval '12 minutes')),
timetz '01:30:20' + hundred * interval '15 seconds',
thousand::bit(10),
tenthous::bit(16)::varbit,
tenthous::numeric(36,30) * fivethous * even / (hundred + 1),
format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid,
format('%s/%s%s', odd, even, tenthous)::pg_lsn
FROM tenk1;
SELECT brin_summarize_new_values('brinidx'::regclass);
UPDATE brintest SET int8col = int8col * int4col;
SET synchronous_commit = 1;
......@@ -1195,11 +1195,13 @@ WHERE NOT (
-- GIN has six support functions. 1-3 are mandatory, 5 is optional, and
-- at least one of 4 and 6 must be given.
-- SP-GiST has five support functions, all mandatory
-- BRIN has four mandatory support functions, and a bunch of optionals
amname = 'btree' AND procnums @> '{1}' OR
amname = 'hash' AND procnums = '{1}' OR
amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
);
-- Also, check if there are any pg_opclass entries that don't seem to have
......@@ -1218,7 +1220,8 @@ WHERE NOT (
amname = 'hash' AND procnums = '{1}' OR
amname = 'gist' AND procnums @> '{1, 2, 3, 4, 5, 6, 7}' OR
amname = 'gin' AND (procnums @> '{1, 2, 3}' AND (procnums && '{4, 6}')) OR
amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}'
amname = 'spgist' AND procnums = '{1, 2, 3, 4, 5}' OR
amname = 'brin' AND procnums @> '{1, 2, 3, 4}'
);
-- Unfortunately, we can't check the amproc link very well because the
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment