Commit 7516f525 authored by Alvaro Herrera's avatar Alvaro Herrera

BRIN: Block Range Indexes

BRIN is a new index access method intended to accelerate scans of very
large tables, without the maintenance overhead of btrees or other
traditional indexes.  They work by maintaining "summary" data about
block ranges.  Bitmap index scans work by reading each summary tuple and
comparing them with the query quals; all pages in the range are returned
in a lossy TID bitmap if the quals are consistent with the values in the
summary tuple, otherwise not.  Normal index scans are not supported
because these indexes do not store TIDs.

As new tuples are added into the index, the summary information is
updated (if the block range in which the tuple is added is already
summarized) or not; in the latter case, a subsequent pass of VACUUM or
the brin_summarize_new_values() function will create the summary
information.

For data types with natural 1-D sort orders, the summary info consists
of the maximum and the minimum values of each indexed column within each
page range.  This type of operator class we call "Minmax", and we
supply a bunch of them for most data types with B-tree opclasses.
Since the BRIN code is generalized, other approaches are possible for
things such as arrays, geometric types, ranges, etc; even for things
such as enum types we could do something different than minmax with
better results.  In this commit I only include minmax.

Catalog version bumped due to new builtin catalog entries.

There's more that could be done here, but this is a good step forwards.

Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera,
with contribution by Heikki Linnakangas.

Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas.
Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo.

PS:
  The research leading to these results has received funding from the
  European Union's Seventh Framework Programme (FP7/2007-2013) under
  grant agreement n° 318633.
parent 1961b1c1
# contrib/pageinspect/Makefile
MODULE_big = pageinspect
OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o $(WIN32RES)
OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o brinfuncs.o $(WIN32RES)
EXTENSION = pageinspect
DATA = pageinspect--1.2.sql pageinspect--1.0--1.1.sql \
DATA = pageinspect--1.3.sql pageinspect--1.0--1.1.sql \
pageinspect--1.2--1.3.sql \
pageinspect--1.1--1.2.sql pageinspect--unpackaged--1.0.sql
PGFILEDESC = "pageinspect - functions to inspect contents of database pages"
......
This diff is collapsed.
/* contrib/pageinspect/pageinspect--1.2--1.3.sql */
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.3'" to load this file. \quit
--
-- brin_page_type()
--
CREATE FUNCTION brin_page_type(IN page bytea)
RETURNS text
AS 'MODULE_PATHNAME', 'brin_page_type'
LANGUAGE C STRICT;
--
-- brin_metapage_info()
--
CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text,
OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint)
AS 'MODULE_PATHNAME', 'brin_metapage_info'
LANGUAGE C STRICT;
--
-- brin_revmap_data()
--
CREATE FUNCTION brin_revmap_data(IN page bytea,
OUT pages tid)
RETURNS SETOF tid
AS 'MODULE_PATHNAME', 'brin_revmap_data'
LANGUAGE C STRICT;
--
-- brin_page_items()
--
CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass,
OUT itemoffset int,
OUT blknum int,
OUT attnum int,
OUT allnulls bool,
OUT hasnulls bool,
OUT placeholder bool,
OUT value text)
RETURNS SETOF record
AS 'MODULE_PATHNAME', 'brin_page_items'
LANGUAGE C STRICT;
/* contrib/pageinspect/pageinspect--1.2.sql */
/* contrib/pageinspect/pageinspect--1.3.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pageinspect" to load this file. \quit
......@@ -98,6 +98,45 @@ RETURNS SETOF record
AS 'MODULE_PATHNAME', 'bt_page_items'
LANGUAGE C STRICT;
--
-- brin_page_type()
--
CREATE FUNCTION brin_page_type(IN page bytea)
RETURNS text
AS 'MODULE_PATHNAME', 'brin_page_type'
LANGUAGE C STRICT;
--
-- brin_metapage_info()
--
CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text,
OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint)
AS 'MODULE_PATHNAME', 'brin_metapage_info'
LANGUAGE C STRICT;
--
-- brin_revmap_data()
--
CREATE FUNCTION brin_revmap_data(IN page bytea,
OUT pages tid)
RETURNS SETOF tid
AS 'MODULE_PATHNAME', 'brin_revmap_data'
LANGUAGE C STRICT;
--
-- brin_page_items()
--
CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass,
OUT itemoffset int,
OUT blknum int,
OUT attnum int,
OUT allnulls bool,
OUT hasnulls bool,
OUT placeholder bool,
OUT value text)
RETURNS SETOF record
AS 'MODULE_PATHNAME', 'brin_page_items'
LANGUAGE C STRICT;
--
-- fsm_page_contents()
--
......
# pageinspect extension
comment = 'inspect the contents of database pages at a low level'
default_version = '1.2'
default_version = '1.3'
module_pathname = '$libdir/pageinspect'
relocatable = true
......@@ -8,6 +8,7 @@
#define FRONTEND 1
#include "postgres.h"
#include "access/brin_xlog.h"
#include "access/clog.h"
#include "access/gin.h"
#include "access/gist_private.h"
......
This diff is collapsed.
......@@ -87,6 +87,7 @@
<!ENTITY gist SYSTEM "gist.sgml">
<!ENTITY spgist SYSTEM "spgist.sgml">
<!ENTITY gin SYSTEM "gin.sgml">
<!ENTITY brin SYSTEM "brin.sgml">
<!ENTITY planstats SYSTEM "planstats.sgml">
<!ENTITY indexam SYSTEM "indexam.sgml">
<!ENTITY nls SYSTEM "nls.sgml">
......
......@@ -116,7 +116,8 @@ CREATE INDEX test1_id_index ON test1 (id);
<para>
<productname>PostgreSQL</productname> provides several index types:
B-tree, Hash, GiST, SP-GiST and GIN. Each index type uses a different
B-tree, Hash, GiST, SP-GiST, GIN and BRIN.
Each index type uses a different
algorithm that is best suited to different types of queries.
By default, the <command>CREATE INDEX</command> command creates
B-tree indexes, which fit the most common situations.
......@@ -326,6 +327,39 @@ SELECT * FROM places ORDER BY location <-> point '(101,456)' LIMIT 10;
classes are available in the <literal>contrib</> collection or as separate
projects. For more information see <xref linkend="GIN">.
</para>
<para>
<indexterm>
<primary>index</primary>
<secondary>BRIN</secondary>
</indexterm>
<indexterm>
<primary>BRIN</primary>
<see>index</see>
</indexterm>
BRIN indexes (a shorthand for Block Range indexes)
store summaries about the values stored in consecutive table physical block ranges.
Like GiST, SP-GiST and GIN,
BRIN can support many different indexing strategies,
and the particular operators with which a BRIN index can be used
vary depending on the indexing strategy.
For datatypes that have a linear sort order, the indexed data
corresponds to the minimum and maximum of the
values in the column for each block range;
this supports indexed queries using these operators:
<simplelist>
<member><literal>&lt;</literal></member>
<member><literal>&lt;=</literal></member>
<member><literal>=</literal></member>
<member><literal>&gt;=</literal></member>
<member><literal>&gt;</literal></member>
</simplelist>
The BRIN operator classes included in the standard distribution are
documented in <xref linkend="brin-builtin-opclasses-table">.
For more information see <xref linkend="BRIN">.
</para>
</sect1>
......
......@@ -196,6 +196,110 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1);
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>brin_page_type(page bytea) returns text</function>
<indexterm>
<primary>brin_page_type</primary>
</indexterm>
</term>
<listitem>
<para>
<function>brin_page_type</function> returns the page type of the given
<acronym>BRIN</acronym> index page, or throws an error if the page is
not a valid <acronym>BRIN</acronym> page. For example:
<screen>
brintest=# select brin_page_type(get_raw_page('brinidx', 0));
brin_page_type
----------------
meta
</screen>
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>brin_metapage_info(page bytea) returns record</function>
<indexterm>
<primary>brin_metapage_info</primary>
</indexterm>
</term>
<listitem>
<para>
<function>brin_metapage_info</function> returns assorted information
about a <acronym>BRIN</acronym> index metapage. For example:
<screen>
brintest=# select * from brin_metapage_info(get_raw_page('brinidx', 0));
magic | version | pagesperrange | lastrevmappage
------------+---------+---------------+----------------
0xA8109CFA | 1 | 4 | 2
</screen>
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>brin_revmap_data(page bytea) returns setof tid</function>
<indexterm>
<primary>brin_revmap_data</primary>
</indexterm>
</term>
<listitem>
<para>
<function>brin_revmap_data</function> returns the list of tuple
identifiers in a <acronym>BRIN</acronym> index range map page.
For example:
<screen>
brintest=# select * from brin_revmap_data(get_raw_page('brinidx', 2)) limit 5;
pages
---------
(6,137)
(6,138)
(6,139)
(6,140)
(6,141)
</screen>
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>brin_page_items(page bytea, index_oid regclass) returns setof record</function>
<indexterm>
<primary>brin_page_items</primary>
</indexterm>
</term>
<listitem>
<para>
<function>brin_page_items</function> returns the data stored in the
<acronym>BRIN</acronym> data page. For example:
<screen>
brintest=# select * from brin_page_items(get_raw_page('brinidx', 5),
brintest(# 'brinidx')
brintest-# order by blknum, attnum limit 6;
itemoffset | blknum | attnum | allnulls | hasnulls | placeholder | value
------------+--------+--------+----------+----------+-------------+--------------
137 | 0 | 1 | t | f | f |
137 | 0 | 2 | f | f | f | {1 .. 88}
138 | 4 | 1 | t | f | f |
138 | 4 | 2 | f | f | f | {89 .. 176}
139 | 8 | 1 | t | f | f |
139 | 8 | 2 | f | f | f | {177 .. 264}
</screen>
The returned columns correspond to the fields in the
<structname>BrinMemTuple</> and <structname>BrinValues</> structs.
See <filename>src/include/access/brin_tuple.h</> for details.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>
<function>fsm_page_contents(page bytea) returns text</function>
......
......@@ -247,6 +247,7 @@
&gist;
&spgist;
&gin;
&brin;
&storage;
&bki;
&planstats;
......
......@@ -8,6 +8,6 @@ subdir = src/backend/access
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam
SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist transam
include $(top_srcdir)/src/backend/common.mk
#-------------------------------------------------------------------------
#
# Makefile--
# Makefile for access/brin
#
# IDENTIFICATION
# src/backend/access/brin/Makefile
#
#-------------------------------------------------------------------------
subdir = src/backend/access/brin
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = brin.o brin_pageops.o brin_revmap.o brin_tuple.o brin_xlog.o \
brin_minmax.o
include $(top_srcdir)/src/backend/common.mk
Block Range Indexes (BRIN)
==========================
BRIN indexes intend to enable very fast scanning of extremely large tables.
The essential idea of a BRIN index is to keep track of summarizing values in
consecutive groups of heap pages (page ranges); for example, the minimum and
maximum values for datatypes with a btree opclass, or the bounding box for
geometric types. These values can be used to avoid scanning such pages
during a table scan, depending on query quals.
The cost of this is having to update the stored summary values of each page
range as tuples are inserted into them.
Access Method Design
--------------------
Since item pointers are not stored inside indexes of this type, it is not
possible to support the amgettuple interface. Instead, we only provide
amgetbitmap support. The amgetbitmap routine returns a lossy TIDBitmap
comprising all pages in those page ranges that match the query
qualifications. The recheck step in the BitmapHeapScan node prunes tuples
that do not satisfy the query qualifications.
An operator class must have the following entries:
- generic support procedures (pg_amproc), identical to all opclasses:
* "opcinfo" (BRIN_PROCNUM_OPCINFO) initializes a structure for index
creation or scanning
* "addValue" (BRIN_PROCNUM_ADDVALUE) takes an index tuple and a heap item,
and possibly changes the index tuple so that it includes the heap item
values
* "consistent" (BRIN_PROCNUM_CONSISTENT) takes an index tuple and query
quals, and returns whether the index tuple values match the query quals.
* "union" (BRIN_PROCNUM_UNION) takes two index tuples and modifies the first
one so that it represents the union of the two.
Procedure numbers up to 10 are reserved for future expansion.
Additionally, each opclass needs additional support functions:
- Minmax-style operator classes:
* Proc numbers 11-14 are used for the functions implementing inequality
operators for the type, in this order: less than, less or equal,
greater or equal, greater than.
Opclasses using a different design will require different additional procedure
numbers.
Operator classes also need to have operator (pg_amop) entries so that the
optimizer can choose the index to execute queries.
- Minmax-style operator classes:
* The same operators as btree (<=, <, =, >=, >)
Each index tuple stores some NULL bits and some opclass-specified values. The
NULL bits are stored in a single null bitmask of length twice the number of
columns. The generic NULL bits indicate, for each column:
* bt_hasnulls: Whether there's any NULL value at all in the page range
* bt_allnulls: Whether all values are NULLs in the page range
The opclass-specified values are:
- Minmax-style operator classes
* minimum value across all tuples in the range
* maximum value across all tuples in the range
Note that the addValue and Union support procedures must be careful to
datumCopy() the values they want to store in the in-memory BRIN tuple, and
must pfree() the old copies when replacing older ones. Since some values
referenced from the tuple persist and others go away, there is no
well-defined lifetime for a memory context that would make this automatic.
The Range Map
-------------
To find the index tuple for a particular page range, we have an internal
structure we call the range map, or "revmap" for short. This stores one TID
per page range, which is the address of the index tuple summarizing that
range. Since the map entries are fixed size, it is possible to compute the
address of the range map entry for any given heap page by simple arithmetic.
When a new heap tuple is inserted in a summarized page range, we compare the
existing index tuple with the new heap tuple. If the heap tuple is outside
the summarization data given by the index tuple for any indexed column (or
if the new heap tuple contains null values but the index tuple indicates
there are no nulls), the index is updated with the new values. In many
cases it is possible to update the index tuple in-place, but if the new
index tuple is larger than the old one and there's not enough space in the
page, it is necessary to create a new index tuple with the new values. The
range map can be updated quickly to point to it; the old index tuple is
removed.
If the range map points to an invalid TID, the corresponding page range is
considered to be not summarized. When tuples are added to unsummarized
pages, nothing needs to happen.
To scan a table following a BRIN index, we scan the range map sequentially.
This yields index tuples in ascending page range order. Query quals are
matched to each index tuple; if they match, each page within the page range
is returned as part of the output TID bitmap. If there's no match, they are
skipped. Range map entries returning invalid index TIDs, that is
unsummarized page ranges, are also returned in the TID bitmap.
The revmap is stored in the first few blocks of the index main fork,
immediately following the metapage. Whenever the revmap needs to be
extended by another page, existing tuples in that page are moved to some
other page.
Heap tuples can be removed from anywhere without restriction. It might be
useful to mark the corresponding index tuple somehow, if the heap tuple is
one of the constraining values of the summary data (i.e. either min or max
in the case of a btree-opclass-bearing datatype), so that in the future we
are aware of the need to re-execute summarization on that range, leading to
a possible tightening of the summary values.
Summarization
-------------
At index creation time, the whole table is scanned; for each page range the
summarizing values of each indexed column and nulls bitmap are collected and
stored in the index. The partially-filled page range at the end of the
table is also summarized.
As new tuples get inserted at the end of the table, they may update the
index tuple that summarizes the partial page range at the end. Eventually
that page range is complete and new tuples belong in a new page range that
hasn't yet been summarized. Those insertions do not create a new index
entry; instead, the page range remains unsummarized until later.
When VACUUM is run on the table, all unsummarized page ranges are
summarized. This action can also be invoked by the user via
brin_summarize_new_values(). Both these procedures scan all the
unsummarized ranges, and create a summary tuple. Again, this includes the
partially-filled page range at the end of the table.
Vacuuming
---------
Since no heap TIDs are stored in a BRIN index, it's not necessary to scan the
index when heap tuples are removed. It might be that some summary values can
be tightened if heap tuples have been deleted; but this would represent an
optimization opportunity only, not a correctness issue. It's simpler to
represent this as the need to re-run summarization on the affected page range
rather than "subtracting" values from the existing one. This is not
currently implemented.
Note that if there are no indexes on the table other than the BRIN index,
usage of maintenance_work_mem by vacuum can be decreased significantly, because
no detailed index scan needs to take place (and thus it's not necessary for
vacuum to save TIDs to remove). It's unlikely that BRIN indexes would be the
only indexes in a table, though, because primary keys can be btrees only, and
so we don't implement this optimization.
Optimizer
---------
The optimizer selects the index based on the operator class' pg_amop
entries for the column.
Future improvements
-------------------
* Different-size page ranges?
In the current design, each "index entry" in a BRIN index covers the same
number of pages. There's no hard reason for this; it might make sense to
allow the index to self-tune so that some index entries cover smaller page
ranges, if this allows the summary values to be more compact. This would incur
larger BRIN overhead for the index itself, but might allow better pruning of
page ranges during scan. In the limit of one index tuple per page, the index
itself would occupy too much space, even though we would be able to skip
reading the most heap pages, because the summary values are tight; in the
opposite limit of a single tuple that summarizes the whole table, we wouldn't
be able to prune anything even though the index is very small. This can
probably be made to work by using the range map as an index in itself.
* More compact representation for TIDBitmap?
TIDBitmap is the structure used to represent bitmap scans. The
representation of lossy page ranges is not optimal for our purposes, because
it uses a Bitmapset to represent pages in the range; since we're going to return
all pages in a large range, it might be more convenient to allow for a
struct that uses start and end page numbers to represent the range, instead.
* Better vacuuming?
It might be useful to enable passing more useful info to BRIN indexes during
vacuuming about tuples that are deleted, i.e. do not require the callback to
pass each tuple's TID. For instance we might need a callback that passes a
block number instead of a TID. That would help determine when to re-run
summarization on blocks that have seen lots of tuple deletions.
This diff is collapsed.
/*
* brin_minmax.c
* Implementation of Min/Max opclass for BRIN
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/access/brin/brin_minmax.c
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/brin_internal.h"
#include "access/brin_tuple.h"
#include "access/skey.h"
#include "catalog/pg_type.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
/*
* Procedure numbers must not collide with BRIN_PROCNUM defines in
* brin_internal.h. Note we only need inequality functions.
*/
#define MINMAX_NUM_PROCNUMS 4 /* # support procs we need */
#define PROCNUM_LESS 11
#define PROCNUM_LESSEQUAL 12
#define PROCNUM_GREATEREQUAL 13
#define PROCNUM_GREATER 14
/*
* Subtract this from procnum to obtain index in MinmaxOpaque arrays
* (Must be equal to minimum of private procnums)
*/
#define PROCNUM_BASE 11
static FmgrInfo *minmax_get_procinfo(BrinDesc *bdesc, uint16 attno,
uint16 procnum);
PG_FUNCTION_INFO_V1(minmaxOpcInfo);
PG_FUNCTION_INFO_V1(minmaxAddValue);
PG_FUNCTION_INFO_V1(minmaxConsistent);
PG_FUNCTION_INFO_V1(minmaxUnion);
/*
 * Per-column cache of the comparison support procedures used by the minmax
 * opclass.  Both arrays are indexed by (procnum - PROCNUM_BASE).
 */
typedef struct MinmaxOpaque
{
/* cached lookup info for each private support procedure */
FmgrInfo operators[MINMAX_NUM_PROCNUMS];
/* true once the matching operators[] entry has been filled in */
bool inited[MINMAX_NUM_PROCNUMS];
} MinmaxOpaque;
/*
 * Build the BrinOpcInfo describing a minmax opclass over the given type.
 *
 * Two values of the indexed type are stored per column.  The opclass-private
 * MinmaxOpaque lives in the same allocation, at the first MAXALIGN'd address
 * past the BrinOpcInfo header.
 */
Datum
minmaxOpcInfo(PG_FUNCTION_ARGS)
{
	Oid			typoid = PG_GETARG_OID(0);
	BrinOpcInfo *info;
	char	   *header_end;

	/*
	 * palloc0 zeroes the whole chunk, so every opaque->inited[] flag starts
	 * out false; the operators[] entries are filled in lazily on first use.
	 */
	info = palloc0(MAXALIGN(SizeofBrinOpcInfo(2)) +
				   sizeof(MinmaxOpaque));

	info->oi_nstored = 2;
	info->oi_typids[0] = typoid;
	info->oi_typids[1] = typoid;

	header_end = (char *) info + SizeofBrinOpcInfo(2);
	info->oi_opaque = (MinmaxOpaque *) MAXALIGN(header_end);

	PG_RETURN_POINTER(info);
}
/*
 * Examine the given index tuple (which contains partial status of a certain
 * page range) by comparing it to the given value that comes from another heap
 * tuple.  If the new value is outside the min/max range specified by the
 * existing tuple values, update the index tuple and return true.  Otherwise,
 * return false and leave the index tuple unmodified.
 *
 * Arguments (fmgr calling convention):
 *	0: BrinDesc * describing the index
 *	1: BrinValues * for the column being updated; bv_values[0] is the
 *	   minimum and bv_values[1] the maximum
 *	2: the new value, as a Datum
 *	3: whether the new value is NULL
 */
Datum
minmaxAddValue(PG_FUNCTION_ARGS)
{
	BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
	BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
	Datum		newval = PG_GETARG_DATUM(2);
	/* fetch the flag as a bool rather than narrowing a raw Datum */
	bool		isnull = PG_GETARG_BOOL(3);
	Oid			colloid = PG_GET_COLLATION();
	FmgrInfo   *cmpFn;
	Datum		compar;
	bool		updated = false;
	Form_pg_attribute attr;
	AttrNumber	attno;

	/*
	 * If the new value is null, we record that we saw it if it's the first
	 * one; otherwise, there's nothing to do.
	 */
	if (isnull)
	{
		if (column->bv_hasnulls)
			PG_RETURN_BOOL(false);

		column->bv_hasnulls = true;
		PG_RETURN_BOOL(true);
	}

	attno = column->bv_attno;
	attr = bdesc->bd_tupdesc->attrs[attno - 1];

	/*
	 * If the recorded value is null, store the new value (which we know to be
	 * not null) as both minimum and maximum, and we're done.
	 */
	if (column->bv_allnulls)
	{
		column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
		column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
		column->bv_allnulls = false;
		PG_RETURN_BOOL(true);
	}

	/*
	 * Otherwise, need to compare the new value with the existing boundaries
	 * and update them accordingly.  First check if it's less than the
	 * existing minimum.
	 */
	cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_LESS);
	compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[0]);
	if (DatumGetBool(compar))
	{
		/* pass-by-reference values are our own copies; release the old one */
		if (!attr->attbyval)
			pfree(DatumGetPointer(column->bv_values[0]));
		column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
		updated = true;
	}

	/*
	 * And now compare it to the existing maximum.
	 */
	cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_GREATER);
	compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[1]);
	if (DatumGetBool(compar))
	{
		if (!attr->attbyval)
			pfree(DatumGetPointer(column->bv_values[1]));
		column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
		updated = true;
	}

	PG_RETURN_BOOL(updated);
}
/*
 * Decide whether a scan key could match any heap tuple in the page range
 * summarized by the given index column values.
 *
 * Arguments (fmgr calling convention):
 *	0: BrinDesc * describing the index
 *	1: BrinValues * holding the range's summary (bv_values[0] = minimum,
 *	   bv_values[1] = maximum)
 *	2: the ScanKey to test
 *
 * Returns true if the range must be scanned, false if it can be skipped.
 */
Datum
minmaxConsistent(PG_FUNCTION_ARGS)
{
	BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
	BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
	ScanKey		key = (ScanKey) PG_GETARG_POINTER(2);
	Oid			colloid = PG_GET_COLLATION();
	AttrNumber	attno;
	Datum		value;
	Datum		matches;
	int			procnum;		/* support procedure to apply */
	int			bound;			/* 0 = compare minimum, 1 = compare maximum */

	Assert(key->sk_attno == column->bv_attno);

	/* IS NULL / IS NOT NULL tests are answered from the null flags alone */
	if (key->sk_flags & SK_ISNULL)
	{
		if (!(key->sk_flags & SK_SEARCHNULL))
		{
			/*
			 * IS NOT NULL: the only ranges we can skip are those known to
			 * contain nothing but nulls.
			 */
			Assert(key->sk_flags & SK_SEARCHNOTNULL);
			PG_RETURN_BOOL(!column->bv_allnulls);
		}

		/* IS NULL: the range qualifies if it holds any null at all */
		PG_RETURN_BOOL(column->bv_allnulls || column->bv_hasnulls);
	}

	/* a range holding only nulls cannot satisfy an ordinary comparison */
	if (column->bv_allnulls)
		PG_RETURN_BOOL(false);

	attno = key->sk_attno;
	value = key->sk_argument;

	/* Pick the comparison and the boundary value it applies to. */
	switch (key->sk_strategy)
	{
		case BTLessStrategyNumber:
			procnum = PROCNUM_LESS;
			bound = 0;
			break;

		case BTLessEqualStrategyNumber:
			procnum = PROCNUM_LESSEQUAL;
			bound = 0;
			break;

		case BTEqualStrategyNumber:

			/*
			 * For WHERE col = someval the range qualifies only when
			 * minimum <= scan key and maximum >= scan key.  Test the minimum
			 * here; if it fails there is no need to look at the maximum.
			 */
			matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
															PROCNUM_LESSEQUAL),
										colloid, column->bv_values[0], value);
			if (!DatumGetBool(matches))
				PG_RETURN_DATUM(matches);

			/* max() >= scankey is checked by the common call below */
			procnum = PROCNUM_GREATEREQUAL;
			bound = 1;
			break;

		case BTGreaterEqualStrategyNumber:
			procnum = PROCNUM_GREATEREQUAL;
			bound = 1;
			break;

		case BTGreaterStrategyNumber:
			procnum = PROCNUM_GREATER;
			bound = 1;
			break;

		default:
			/* shouldn't happen */
			elog(ERROR, "invalid strategy number %d", key->sk_strategy);
			PG_RETURN_DATUM(0);
	}

	matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, procnum),
								colloid, column->bv_values[bound], value);

	PG_RETURN_DATUM(matches);
}
/*
 * Given two BrinValues, update the first of them as a union of the summary
 * values contained in both.  The second one is untouched.
 *
 * Arguments (fmgr calling convention):
 *	0: BrinDesc * describing the index
 *	1: BrinValues * "A", modified in place
 *	2: BrinValues * "B", read only
 *
 * In both, bv_values[0] is the minimum and bv_values[1] the maximum.
 */
Datum
minmaxUnion(PG_FUNCTION_ARGS)
{
	BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
	BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
	BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
	Oid			colloid = PG_GET_COLLATION();
	AttrNumber	attno;
	Form_pg_attribute attr;
	bool		needsadj;

	Assert(col_a->bv_attno == col_b->bv_attno);

	/*
	 * If B contains only nulls there are no values to merge.  The union does
	 * contain nulls, though, so when A has actual values it must be flagged
	 * as also having nulls; otherwise an IS NULL scan could wrongly skip the
	 * range.  (If A is all-nulls too, it already describes the union.)
	 */
	if (col_b->bv_allnulls)
	{
		if (!col_a->bv_allnulls)
			col_a->bv_hasnulls = true;
		PG_RETURN_VOID();
	}

	attno = col_a->bv_attno;
	attr = bdesc->bd_tupdesc->attrs[attno - 1];

	/* Adjust "hasnulls" */
	if (col_b->bv_hasnulls && !col_a->bv_hasnulls)
		col_a->bv_hasnulls = true;

	/*
	 * Adjust "allnulls".  B is known to have values at this point; if A
	 * doesn't, just copy the values from B into A, and we're done.  (We
	 * cannot run the operators in this case, because values in A might
	 * contain garbage.)
	 */
	if (col_a->bv_allnulls)
	{
		col_a->bv_allnulls = false;
		col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
										attr->attbyval, attr->attlen);
		col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
										attr->attbyval, attr->attlen);
		PG_RETURN_VOID();
	}

	/* Adjust minimum, if B's min is less than A's min */
	needsadj = DatumGetBool(FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
																   PROCNUM_LESS),
											  colloid,
											  col_b->bv_values[0],
											  col_a->bv_values[0]));
	if (needsadj)
	{
		/* pass-by-reference values are our own copies; release the old one */
		if (!attr->attbyval)
			pfree(DatumGetPointer(col_a->bv_values[0]));
		col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
										attr->attbyval, attr->attlen);
	}

	/* Adjust maximum, if B's max is greater than A's max */
	needsadj = DatumGetBool(FunctionCall2Coll(minmax_get_procinfo(bdesc, attno,
																   PROCNUM_GREATER),
											  colloid,
											  col_b->bv_values[1],
											  col_a->bv_values[1]));
	if (needsadj)
	{
		if (!attr->attbyval)
			pfree(DatumGetPointer(col_a->bv_values[1]));
		col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
										attr->attbyval, attr->attlen);
	}

	PG_RETURN_VOID();
}
/*
 * Fetch the FmgrInfo for one of the opclass-private support procedures
 * (PROCNUM_LESS .. PROCNUM_GREATER) of the given index column.
 *
 * Results are cached in the column's MinmaxOpaque, so that repeated calls do
 * not have to repeat the syscache lookup.
 */
static FmgrInfo *
minmax_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
{
	uint16		slot = procnum - PROCNUM_BASE;
	MinmaxOpaque *cache = (MinmaxOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
	FmgrInfo   *entry = &cache->operators[slot];

	/* Fast path: already looked up on an earlier call. */
	if (cache->inited[slot])
		return entry;

	/* First use: copy the lookup info into our cache slot. */
	fmgr_info_copy(entry,
				   index_getprocinfo(bdesc->bd_index, attno, procnum),
				   bdesc->bd_context);
	cache->inited[slot] = true;

	return entry;
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -209,6 +209,13 @@ static relopt_int intRelOpts[] =
RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
}, -1, 0, 2000000000
},
{
{
"pages_per_range",
"Number of pages that each page range covers in a BRIN index",
RELOPT_KIND_BRIN
}, 128, 1, 131072
},
/* list terminator */
{{NULL}}
......
......@@ -272,6 +272,8 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
scan->rs_startblock = 0;
}
scan->rs_initblock = 0;
scan->rs_numblocks = InvalidBlockNumber;
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
......@@ -297,6 +299,14 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
pgstat_count_heap_scan(scan->rs_rd);
}
/*
 * heap_setscanlimits - record a block range limit in a heap scan descriptor
 *
 * startBlk is stored as both the start block and the initial block of the
 * scan; numBlks is stored as the number of blocks to scan.
 */
void
heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
{
	scan->rs_numblocks = numBlks;
	scan->rs_initblock = scan->rs_startblock = startBlk;
}
/*
* heapgetpage - subroutine for heapgettup()
*
......@@ -637,7 +647,8 @@ heapgettup(HeapScanDesc scan,
*/
if (backward)
{
finished = (page == scan->rs_startblock);
finished = (page == scan->rs_startblock) ||
(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
if (page == 0)
page = scan->rs_nblocks;
page--;
......@@ -647,7 +658,8 @@ heapgettup(HeapScanDesc scan,
page++;
if (page >= scan->rs_nblocks)
page = 0;
finished = (page == scan->rs_startblock);
finished = (page == scan->rs_startblock) ||
(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
/*
* Report our new scan position for synchronization purposes. We
......@@ -898,7 +910,8 @@ heapgettup_pagemode(HeapScanDesc scan,
*/
if (backward)
{
finished = (page == scan->rs_startblock);
finished = (page == scan->rs_startblock) ||
(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
if (page == 0)
page = scan->rs_nblocks;
page--;
......@@ -908,7 +921,8 @@ heapgettup_pagemode(HeapScanDesc scan,
page++;
if (page >= scan->rs_nblocks)
page = 0;
finished = (page == scan->rs_startblock);
finished = (page == scan->rs_startblock) ||
(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks <= 0 : false);
/*
* Report our new scan position for synchronization purposes. We
......
......@@ -8,7 +8,8 @@ subdir = src/backend/access/rmgrdesc
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \
OBJS = brindesc.o clogdesc.o dbasedesc.o gindesc.o gistdesc.o \
hashdesc.o heapdesc.o \
mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o smgrdesc.o spgdesc.o \
standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o
......
/*-------------------------------------------------------------------------
*
* brindesc.c
* rmgr descriptor routines for BRIN indexes
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/rmgrdesc/brindesc.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/brin_xlog.h"
/*
 * brin_desc
 *		Append a textual description of a BRIN WAL record to "buf".
 *
 * buf:    output string buffer; text is appended, nothing is removed.
 * record: the WAL record to describe; its payload is obtained with
 *         XLogRecGetData() and reinterpreted according to the opcode.
 *
 * The opcode is taken from xl_info after clearing the XLR_INFO_MASK bits
 * and then restricting to XLOG_BRIN_OPMASK (presumably this drops the
 * XLOG_BRIN_INIT_PAGE flag, so INSERT and INSERT+INIT are described the
 * same way -- confirm against brin_xlog.h).  Opcodes not listed below
 * leave "buf" untouched.
 */
void
brin_desc(StringInfo buf, XLogRecord *record)
{
	char	   *payload = XLogRecGetData(record);
	uint8		opcode = (record->xl_info & ~XLR_INFO_MASK) & XLOG_BRIN_OPMASK;

	switch (opcode)
	{
		case XLOG_BRIN_CREATE_INDEX:
			{
				xl_brin_createidx *createrec = (xl_brin_createidx *) payload;

				appendStringInfo(buf, "v%d pagesPerRange %u rel %u/%u/%u",
								 createrec->version,
								 createrec->pagesPerRange,
								 createrec->node.spcNode,
								 createrec->node.dbNode,
								 createrec->node.relNode);
				break;
			}

		case XLOG_BRIN_INSERT:
			{
				xl_brin_insert *insrec = (xl_brin_insert *) payload;

				appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u TID (%u,%u)",
								 insrec->node.spcNode,
								 insrec->node.dbNode,
								 insrec->node.relNode,
								 insrec->heapBlk,
								 insrec->revmapBlk,
								 insrec->pagesPerRange,
								 ItemPointerGetBlockNumber(&insrec->tid),
								 ItemPointerGetOffsetNumber(&insrec->tid));
				break;
			}

		case XLOG_BRIN_UPDATE:
			{
				xl_brin_update *updrec = (xl_brin_update *) payload;

				/* The update record embeds an insert record as "new". */
				appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u old TID (%u,%u) TID (%u,%u)",
								 updrec->new.node.spcNode,
								 updrec->new.node.dbNode,
								 updrec->new.node.relNode,
								 updrec->new.heapBlk,
								 updrec->new.revmapBlk,
								 updrec->new.pagesPerRange,
								 ItemPointerGetBlockNumber(&updrec->oldtid),
								 ItemPointerGetOffsetNumber(&updrec->oldtid),
								 ItemPointerGetBlockNumber(&updrec->new.tid),
								 ItemPointerGetOffsetNumber(&updrec->new.tid));
				break;
			}

		case XLOG_BRIN_SAMEPAGE_UPDATE:
			{
				xl_brin_samepage_update *sprec = (xl_brin_samepage_update *) payload;

				appendStringInfo(buf, "rel %u/%u/%u TID (%u,%u)",
								 sprec->node.spcNode,
								 sprec->node.dbNode,
								 sprec->node.relNode,
								 ItemPointerGetBlockNumber(&sprec->tid),
								 ItemPointerGetOffsetNumber(&sprec->tid));
				break;
			}

		case XLOG_BRIN_REVMAP_EXTEND:
			{
				xl_brin_revmap_extend *extrec = (xl_brin_revmap_extend *) payload;

				appendStringInfo(buf, "rel %u/%u/%u targetBlk %u",
								 extrec->node.spcNode,
								 extrec->node.dbNode,
								 extrec->node.relNode,
								 extrec->targetBlk);
				break;
			}

		default:
			/* unrecognized opcode: emit nothing */
			break;
	}
}
/*
 * brin_identify
 *		Map a BRIN WAL record's info byte to a short, constant name.
 *
 * The XLR_INFO_MASK bits of "info" are ignored.  INSERT and UPDATE have
 * separate names for the variants combined with XLOG_BRIN_INIT_PAGE.
 * Returns NULL when the remaining bits match none of the known record
 * types; the returned string is a literal and must not be freed.
 */
const char *
brin_identify(uint8 info)
{
	switch (info & ~XLR_INFO_MASK)
	{
		case XLOG_BRIN_CREATE_INDEX:
			return "CREATE_INDEX";
		case XLOG_BRIN_INSERT:
			return "INSERT";
		case XLOG_BRIN_INSERT | XLOG_BRIN_INIT_PAGE:
			return "INSERT+INIT";
		case XLOG_BRIN_UPDATE:
			return "UPDATE";
		case XLOG_BRIN_UPDATE | XLOG_BRIN_INIT_PAGE:
			return "UPDATE+INIT";
		case XLOG_BRIN_SAMEPAGE_UPDATE:
			return "SAMEPAGE_UPDATE";
		case XLOG_BRIN_REVMAP_EXTEND:
			return "REVMAP_EXTEND";
	}

	/* no known record type matched */
	return NULL;
}
......@@ -12,6 +12,7 @@
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/heapam_xlog.h"
#include "access/brin_xlog.h"
#include "access/multixact.h"
#include "access/nbtree.h"
#include "access/spgist.h"
......
......@@ -2103,6 +2103,27 @@ IndexBuildHeapScan(Relation heapRelation,
bool allow_sync,
IndexBuildCallback callback,
void *callback_state)
{
return IndexBuildHeapRangeScan(heapRelation, indexRelation,
indexInfo, allow_sync,
0, InvalidBlockNumber,
callback, callback_state);
}
/*
* As above, except that instead of scanning the complete heap, only the given
* number of blocks is scanned, beginning at start_blockno. Scan to end-of-rel
* can be signalled by passing InvalidBlockNumber as numblocks.
*/
double
IndexBuildHeapRangeScan(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
bool allow_sync,
BlockNumber start_blockno,
BlockNumber numblocks,
IndexBuildCallback callback,
void *callback_state)
{
bool is_system_catalog;
bool checking_uniqueness;
......@@ -2174,6 +2195,9 @@ IndexBuildHeapScan(Relation heapRelation,
true, /* buffer access strategy OK */
allow_sync); /* syncscan OK? */
/* set our scan endpoints */
heap_setscanlimits(scan, start_blockno, numblocks);
reltuples = 0;
/*
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment