Commit 9ee014fc authored by Teodor Sigaev's avatar Teodor Sigaev

Bloom index contrib module

This module provides a new index access method. It is essentially a simple
Bloom filter implemented as a PostgreSQL index, and it can be beneficial for
searches over a large number of columns.

The module is also currently the only way to test the generic WAL interface committed earlier.

Author: Teodor Sigaev, Alexander Korotkov
Reviewers: Aleksander Alekseev, Michael Paquier, Jim Nasby
parent 4e56e5a6
......@@ -8,6 +8,7 @@ SUBDIRS = \
adminpack \
auth_delay \
auto_explain \
bloom \
btree_gin \
btree_gist \
chkpass \
......
# Generated subdirectories
/log/
/results/
/tmp_check/
# contrib/bloom/Makefile
#
# Builds the "bloom" extension as one shared library (MODULE_big) out of the
# object files listed in OBJS, installs the SQL script named in DATA and
# runs the "bloom" regression test (REGRESS).
MODULE_big = bloom
OBJS = blcost.o blinsert.o blscan.o blutils.o blvacuum.o blvalidate.o $(WIN32RES)
EXTENSION = bloom
DATA = bloom--1.0.sql
PGFILEDESC = "bloom access method - signature file based index"
REGRESS = bloom
# Two build modes: with USE_PGXS set, the PGXS makefile located through
# pg_config is included; otherwise the in-tree global makefiles are included.
ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = contrib/bloom
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif
# Extra target that expands $(prove_check) after temp-install.
# NOTE(review): a make recipe line must start with a tab; the leading
# whitespace of the line below appears to be missing in this listing.
wal-check: temp-install
$(prove_check)
/*-------------------------------------------------------------------------
*
* blcost.c
* Cost estimate function for bloom indexes.
*
* Copyright (c) 2016, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/bloom/blcost.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "optimizer/cost.h"
#include "utils/selfuncs.h"
#include "bloom.h"
/*
 * Planner cost estimator for bloom index scans.
 *
 * A bloom scan has to look at every index tuple, so the tuple count handed
 * to the generic estimator is the total number of tuples in the index.
 * All remaining work is delegated to genericcostestimate(), whose results
 * are copied into the caller's output arguments.
 */
void
blcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
			   Cost *indexStartupCost, Cost *indexTotalCost,
			   Selectivity *indexSelectivity, double *indexCorrelation)
{
	GenericCosts costs;
	List	   *qinfos;

	/* Start from an all-zero cost structure and force a full-index visit */
	MemSet(&costs, 0, sizeof(costs));
	costs.numIndexTuples = path->indexinfo->tuples;

	/* Break the index quals down into the form the generic code expects */
	qinfos = deconstruct_indexquals(path);

	genericcostestimate(root, path, loop_count, qinfos, &costs);

	/* Hand the generic results back to the planner */
	*indexStartupCost = costs.indexStartupCost;
	*indexTotalCost = costs.indexTotalCost;
	*indexSelectivity = costs.indexSelectivity;
	*indexCorrelation = costs.indexCorrelation;
}
/*-------------------------------------------------------------------------
*
* blinsert.c
* Bloom index build and insert functions.
*
* Copyright (c) 2016, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/bloom/blinsert.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/generic_xlog.h"
#include "catalog/index.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/indexfsm.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "bloom.h"
PG_MODULE_MAGIC;
/*
 * Working state of a bloom index build.
 *
 * Tuples produced by the heap scan are packed into the in-memory page image
 * "data"; when that image cannot take another tuple it is copied into a
 * newly obtained buffer (see flushCachedPage) and re-initialized.
 */
typedef struct
{
BloomState blstate; /* bloom index state (hash functions, options,
 * tuple size) */
MemoryContext tmpCtx; /* temporary memory context reset after
 * each tuple */
char data[BLCKSZ]; /* cached page image being filled */
int64 count; /* number of tuples in cached page; blbuild
 * flushes the last page only if this is > 0 */
} BloomBuildState;
/*
 * Write the page image accumulated in the build state out to the index.
 *
 * A buffer is obtained from BloomNewBuffer(), the cached image is copied
 * over the page registered with the generic WAL machinery, and the buffer
 * is then unlocked and released.  The cached image itself is left untouched;
 * callers re-initialize it with initCachedPage() when they need a new one.
 */
static void
flushCachedPage(Relation index, BloomBuildState *buildstate)
{
	Buffer		target;
	GenericXLogState *xlog;
	Page		targetPage;

	target = BloomNewBuffer(index);

	xlog = GenericXLogStart(index);
	targetPage = GenericXLogRegister(xlog, target, true);
	memcpy(targetPage, buildstate->data, BLCKSZ);
	GenericXLogFinish(xlog);

	UnlockReleaseBuffer(target);
}
/*
 * Reset the cached page image of a build state to an empty bloom data page
 * (no flags set) and restart its tuple counter.
 */
static void
initCachedPage(BloomBuildState *buildstate)
{
	buildstate->count = 0;

	/* Wipe the whole block first so no stale bytes survive in the image */
	memset(buildstate->data, 0, BLCKSZ);
	BloomInitPage(buildstate->data, 0);
}
/*
 * Per-tuple callback from IndexBuildHeapScan.
 *
 * Forms a bloom tuple for the heap tuple and appends it to the page image
 * cached in the build state.  When the cached page is full it is flushed,
 * re-initialized, and the tuple is added to the fresh page instead.
 *
 * "state" is the BloomBuildState passed to IndexBuildHeapScan by blbuild.
 * All per-tuple allocations happen in buildstate->tmpCtx, which is reset
 * before returning.
 */
static void
bloomBuildCallback(Relation index, HeapTuple htup, Datum *values,
				   bool *isnull, bool tupleIsAlive, void *state)
{
	BloomBuildState *buildstate = (BloomBuildState *) state;
	MemoryContext oldCtx;
	BloomTuple *itup;

	oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);

	itup = BloomFormTuple(&buildstate->blstate, &htup->t_self, values, isnull);

	/* Try to add next item to cached page */
	if (!BloomPageAddItem(&buildstate->blstate, buildstate->data, itup))
	{
		/* Cached page is full, flush it out and make a new one */
		flushCachedPage(index, buildstate);

		CHECK_FOR_INTERRUPTS();

		initCachedPage(buildstate);

		if (!BloomPageAddItem(&buildstate->blstate, buildstate->data, itup))
		{
			/* We shouldn't be here since we're inserting to the empty page */
			elog(ERROR, "can not add new tuple");
		}
	}

	/*
	 * The tuple now lives in the cached page, whichever branch put it there.
	 * It must be counted in both cases: blbuild flushes the final page only
	 * when count > 0, so skipping the increment after a flush would drop the
	 * tuple if it happened to be the last one of the scan.
	 */
	buildstate->count++;

	MemoryContextSwitchTo(oldCtx);
	MemoryContextReset(buildstate->tmpCtx);
}
/*
 * Build a new bloom index over "heap".
 *
 * The index must be empty on entry.  The metapage is created first, then
 * the heap is scanned and every tuple is packed into page images by
 * bloomBuildCallback; a partially filled last page is flushed afterwards.
 *
 * Returns a palloc'd IndexBuildResult in which both tuple counts are set to
 * the number reported by the heap scan.
 */
IndexBuildResult *
blbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	BloomBuildState buildstate;
	IndexBuildResult *result;
	double		reltuples;

	/* Refuse to build on top of existing data */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/* The metapage has to exist before the bloom state can be read */
	BloomInitMetapage(index);

	/* Set up the build state: bloom state, scratch context, empty page */
	memset(&buildstate, 0, sizeof(buildstate));
	initBloomState(&buildstate.blstate, index);
	buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
											  "Bloom build temporary context",
											  ALLOCSET_DEFAULT_MINSIZE,
											  ALLOCSET_DEFAULT_INITSIZE,
											  ALLOCSET_DEFAULT_MAXSIZE);
	initCachedPage(&buildstate);

	/* Feed every heap tuple through the build callback */
	reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
								   bloomBuildCallback, (void *) &buildstate);

	/* The cached page may still hold tuples that were never written out */
	if (buildstate.count > 0)
		flushCachedPage(index, &buildstate);

	MemoryContextDelete(buildstate.tmpCtx);

	/* Report the same count for heap and index tuples */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
	result->index_tuples = reltuples;
	result->heap_tuples = reltuples;

	return result;
}
/*
 * Build an empty bloom index: verify the relation has no blocks and create
 * the metapage.
 *
 * NOTE(review): the original header says this builds the index "in the
 * initialization fork", but nothing in this function names a fork; both the
 * block-count check and BloomInitMetapage() are called with the relation
 * only.  Confirm which fork is actually checked and written here.
 */
void
blbuildempty(Relation index)
{
if (RelationGetNumberOfBlocks(index) != 0)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
/* Initialize the meta page */
BloomInitMetapage(index);
}
/*
 * Insert new tuple to the bloom index.
 *
 * The tuple is formed in a private memory context and then placed using
 * three attempts, in order:
 *
 *	1. the first page listed in the metapage's notFullPage array, touching
 *	   the metapage only under a share lock;
 *	2. the remaining pages of notFullPage, with the metapage exclusively
 *	   locked so that nStart can be advanced;
 *	3. a newly allocated page, which becomes the only notFullPage entry.
 *
 * Every exit path restores the caller's memory context and deletes the
 * private one.  Always returns false.
 */
bool
blinsert(Relation index, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique)
{
	BloomState	blstate;
	BloomTuple *itup;
	MemoryContext oldCtx;
	MemoryContext insertCtx;
	BloomMetaPageData *metaData;
	Buffer		buffer,
				metaBuffer;
	Page		page,
				metaPage;
	BlockNumber blkno = InvalidBlockNumber;
	OffsetNumber nStart;
	GenericXLogState *state;

	insertCtx = AllocSetContextCreate(CurrentMemoryContext,
									  "Bloom insert temporary context",
									  ALLOCSET_DEFAULT_MINSIZE,
									  ALLOCSET_DEFAULT_INITSIZE,
									  ALLOCSET_DEFAULT_MAXSIZE);

	oldCtx = MemoryContextSwitchTo(insertCtx);

	initBloomState(&blstate, index);
	itup = BloomFormTuple(&blstate, ht_ctid, values, isnull);

	/*
	 * At first, try to insert new tuple to the first page in notFullPage
	 * array.  If successful, we don't need to modify the meta page.
	 */
	metaBuffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO);
	LockBuffer(metaBuffer, BUFFER_LOCK_SHARE);
	metaData = BloomPageGetMeta(BufferGetPage(metaBuffer));

	if (metaData->nEnd > metaData->nStart)
	{
		blkno = metaData->notFullPage[metaData->nStart];
		Assert(blkno != InvalidBlockNumber);

		/* Don't hold the metapage lock while working on the data page */
		LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK);

		buffer = ReadBuffer(index, blkno);
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

		state = GenericXLogStart(index);
		page = GenericXLogRegister(state, buffer, false);

		if (BloomPageAddItem(&blstate, page, itup))
		{
			/* Success: metapage stays as it is, only the pin is dropped */
			GenericXLogFinish(state);
			UnlockReleaseBuffer(buffer);
			ReleaseBuffer(metaBuffer);
			MemoryContextSwitchTo(oldCtx);
			MemoryContextDelete(insertCtx);
			return false;
		}

		/* The page turned out to be full; nothing was changed */
		GenericXLogAbort(state);
		UnlockReleaseBuffer(buffer);
	}
	else
	{
		/* notFullPage array holds no candidate pages at all */
		LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK);
	}

	/*
	 * Try other pages in notFullPage array.  We will have to change nStart in
	 * metapage.  Thus, grab exclusive lock on metapage.
	 */
	LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE);

	state = GenericXLogStart(index);
	metaPage = GenericXLogRegister(state, metaBuffer, false);
	metaData = BloomPageGetMeta(metaPage);

	/*
	 * Iterate over notFullPage array.  Skip page we already tried first.
	 */
	nStart = metaData->nStart;
	if (metaData->nEnd > nStart &&
		blkno == metaData->notFullPage[nStart])
		nStart++;

	while (metaData->nEnd > nStart)
	{
		blkno = metaData->notFullPage[nStart];
		Assert(blkno != InvalidBlockNumber);

		buffer = ReadBuffer(index, blkno);
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		page = GenericXLogRegister(state, buffer, false);

		if (BloomPageAddItem(&blstate, page, itup))
		{
			/* Remember where we succeeded so later inserts start here */
			metaData->nStart = nStart;
			GenericXLogFinish(state);
			UnlockReleaseBuffer(buffer);
			UnlockReleaseBuffer(metaBuffer);
			MemoryContextSwitchTo(oldCtx);
			MemoryContextDelete(insertCtx);
			return false;
		}

		/* Full page: drop it from the WAL record and move on */
		GenericXLogUnregister(state, buffer);
		UnlockReleaseBuffer(buffer);
		nStart++;
	}

	GenericXLogAbort(state);

	/*
	 * Didn't find place to insert in notFullPage array.  Allocate new page.
	 * The metapage is still exclusively locked at this point.
	 */
	buffer = BloomNewBuffer(index);

	state = GenericXLogStart(index);
	metaPage = GenericXLogRegister(state, metaBuffer, false);
	metaData = BloomPageGetMeta(metaPage);
	page = GenericXLogRegister(state, buffer, true);
	BloomInitPage(page, 0);

	if (!BloomPageAddItem(&blstate, page, itup))
	{
		/* We shouldn't be here since we're inserting to the empty page */
		elog(ERROR, "can not add new tuple");
	}

	/* The new page becomes the sole entry of the notFullPage array */
	metaData->nStart = 0;
	metaData->nEnd = 1;
	metaData->notFullPage[0] = BufferGetBlockNumber(buffer);

	GenericXLogFinish(state);

	UnlockReleaseBuffer(buffer);
	UnlockReleaseBuffer(metaBuffer);

	/*
	 * Restore the caller's context and drop ours, exactly as the two earlier
	 * exits do; otherwise the caller would keep running in insertCtx and the
	 * context would never be freed.
	 */
	MemoryContextSwitchTo(oldCtx);
	MemoryContextDelete(insertCtx);

	return false;
}
-- Handler function of the access method; implemented in C in the module
-- named by MODULE_PATHNAME.
CREATE OR REPLACE FUNCTION blhandler(internal)
RETURNS index_am_handler
AS 'MODULE_PATHNAME'
LANGUAGE C;
-- Access method
CREATE ACCESS METHOD bloom TYPE INDEX HANDLER blhandler;
-- Opclasses
-- Each operator class supplies one operator (strategy 1, equality) and one
-- support function (number 1, the hash function for the type).
CREATE OPERATOR CLASS int4_ops
DEFAULT FOR TYPE int4 USING bloom AS
OPERATOR 1 =(int4, int4),
FUNCTION 1 hashint4(int4);
CREATE OPERATOR CLASS text_ops
DEFAULT FOR TYPE text USING bloom AS
OPERATOR 1 =(text, text),
FUNCTION 1 hashtext(text);
# bloom extension
comment = 'bloom access method - signature file based index'
default_version = '1.0'
module_pathname = '$libdir/bloom'
relocatable = true
/*-------------------------------------------------------------------------
*
* bloom.h
* Header for bloom index.
*
* Copyright (c) 2016, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/bloom/bloom.h
*
*-------------------------------------------------------------------------
*/
#ifndef _BLOOM_H_
#define _BLOOM_H_
#include "access/amapi.h"
#include "access/generic_xlog.h"
#include "access/itup.h"
#include "access/xlog.h"
#include "nodes/relation.h"
#include "fmgr.h"
/* Support procedures numbers */
#define BLOOM_HASH_PROC 1
#define BLOOM_NPROC 1
/* Scan strategies */
#define BLOOM_EQUAL_STRATEGY 1
#define BLOOM_NSTRATEGIES 1
/* Opaque (special-space) data kept on every bloom page */
typedef struct BloomPageOpaqueData
{
OffsetNumber maxoff; /* number of bloom tuples stored on the page */
uint16 flags; /* BLOOM_* page flags, see below */
} BloomPageOpaqueData;
typedef BloomPageOpaqueData *BloomPageOpaque;
/* Bloom page flags (bit values 1 and 2 respectively) */
#define BLOOM_META (1<<0)
#define BLOOM_DELETED (2<<0)
/* Macros for accessing bloom page structures */
#define BloomPageGetOpaque(page) ((BloomPageOpaque) PageGetSpecialPointer(page))
#define BloomPageGetMaxOffset(page) (BloomPageGetOpaque(page)->maxoff)
#define BloomPageIsMeta(page) (BloomPageGetOpaque(page)->flags & BLOOM_META)
#define BloomPageIsDeleted(page) (BloomPageGetOpaque(page)->flags & BLOOM_DELETED)
#define BloomPageSetDeleted(page) (BloomPageGetOpaque(page)->flags |= BLOOM_DELETED)
#define BloomPageSetNonDeleted(page) (BloomPageGetOpaque(page)->flags &= ~BLOOM_DELETED)
#define BloomPageGetData(page) ((BloomTuple *)PageGetContents(page))
/*
 * Address of the tuple at 1-based position "offset": tuples are fixed-size
 * ((state)->sizeOfBloomTuple bytes) and stored back to back from the start
 * of the page contents.
 */
#define BloomPageGetTuple(state, page, offset) \
((BloomTuple *)(PageGetContents(page) \
+ (state)->sizeOfBloomTuple * ((offset) - 1)))
/* Address of the tuple slot immediately following "tuple" */
#define BloomPageGetNextTuple(state, tuple) \
((BloomTuple *)((Pointer)(tuple) + (state)->sizeOfBloomTuple))
/* Preserved page numbers */
#define BLOOM_METAPAGE_BLKNO (0)
#define BLOOM_HEAD_BLKNO (1) /* first data page */
/* Bloom index options (a copy is also stored in the metapage) */
typedef struct BloomOptions
{
int32 vl_len_; /* varlena header (do not touch directly!) */
int bloomLength; /* length of signature in uint16 */
int bitSize[INDEX_MAX_KEYS]; /* signature bits per index
 * key */
} BloomOptions;
/*
 * FreeBlockNumberArray - array of block numbers sized so that the metadata
 * fills all the space in the metapage.
 */
typedef BlockNumber FreeBlockNumberArray[
MAXALIGN_DOWN(
BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(BloomPageOpaqueData))
- MAXALIGN(sizeof(uint16) * 2 + sizeof(uint32) + sizeof(BloomOptions))
) / sizeof(BlockNumber)
];
/* Metadata of bloom index, stored in the contents of the metapage */
typedef struct BloomMetaPageData
{
uint32 magickNumber; /* must equal BLOOM_MAGICK_NUMBER */
uint16 nStart; /* first notFullPage entry that inserts try */
uint16 nEnd; /* one past the last used notFullPage entry */
BloomOptions opts; /* index options fixed at creation time */
FreeBlockNumberArray notFullPage; /* blocks that may still have room */
} BloomMetaPageData;
/* Magic number to distinguish a bloom metapage from other pages */
#define BLOOM_MAGICK_NUMBER (0xDBAC0DED)
/* Number of block numbers that fit in BloomMetaPageData */
#define BloomMetaBlockN (sizeof(FreeBlockNumberArray) / sizeof(BlockNumber))
#define BloomPageGetMeta(page) ((BloomMetaPageData *) PageGetContents(page))
/* Per-index working state filled in by initBloomState() */
typedef struct BloomState
{
FmgrInfo hashFn[INDEX_MAX_KEYS]; /* hash support function per column */
BloomOptions *opts; /* stored in rd_amcache and defined at
 * creation time */
int32 nColumns; /* number of index columns */
/*
 * sizeOfBloomTuple is index-specific and depends on reloptions, so
 * precompute it
 */
int32 sizeOfBloomTuple;
} BloomState;
/*
 * Free bytes on a data page: block size minus page header, opaque area and
 * the tuples already stored.
 */
#define BloomPageGetFreeSpace(state, page) \
(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
- BloomPageGetMaxOffset(page) * (state)->sizeOfBloomTuple \
- MAXALIGN(sizeof(BloomPageOpaqueData)))
/*
 * Bloom tuples differ from ordinary index tuples: each is a heap pointer
 * followed by a signature of opts->bloomLength SignType words.
 */
typedef uint16 SignType;
typedef struct BloomTuple
{
ItemPointerData heapPtr; /* heap tuple this entry points to */
SignType sign[1]; /* signature; actual length is set per index */
} BloomTuple;
/* Size of the fixed part of a bloom tuple, i.e. everything before sign[] */
#define BLOOMTUPLEHDRSZ offsetof(BloomTuple, sign)
/* Opaque data structure for bloom index scan */
typedef struct BloomScanOpaqueData
{
SignType *sign; /* Scan signature, NULL until computed */
BloomState state;
} BloomScanOpaqueData;
typedef BloomScanOpaqueData *BloomScanOpaque;
/* blutils.c: module setup, page helpers, signature and tuple construction */
extern void _PG_init(void);
extern Datum blhandler(PG_FUNCTION_ARGS);
extern void initBloomState(BloomState * state, Relation index);
extern void BloomInitMetapage(Relation index);
extern void BloomInitPage(Page page, uint16 flags);
extern Buffer BloomNewBuffer(Relation index);
extern void signValue(BloomState * state, SignType * sign, Datum value, int attno);
extern BloomTuple *BloomFormTuple(BloomState * state, ItemPointer iptr, Datum *values, bool *isnull);
/* Returns false when the tuple does not fit on the page */
extern bool BloomPageAddItem(BloomState * state, Page page, BloomTuple * tuple);
/* blvalidate.c */
extern bool blvalidate(Oid opclassoid);
/* index access method interface functions (installed by blhandler) */
extern bool blinsert(Relation index, Datum *values, bool *isnull,
ItemPointer ht_ctid, Relation heapRel,
IndexUniqueCheck checkUnique);
extern IndexScanDesc blbeginscan(Relation r, int nkeys, int norderbys);
extern int64 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern void blrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
ScanKey orderbys, int norderbys);
extern void blendscan(IndexScanDesc scan);
extern IndexBuildResult *blbuild(Relation heap, Relation index,
struct IndexInfo *indexInfo);
extern void blbuildempty(Relation index);
extern IndexBulkDeleteResult *blbulkdelete(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback,
void *callback_state);
extern IndexBulkDeleteResult *blvacuumcleanup(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats);
extern bytea *bloptions(Datum reloptions, bool validate);
extern void blcostestimate(PlannerInfo *root, IndexPath *path,
double loop_count, Cost *indexStartupCost,
Cost *indexTotalCost, Selectivity *indexSelectivity,
double *indexCorrelation);
#endif
/*-------------------------------------------------------------------------
*
* blscan.c
* Bloom index scan functions.
*
* Copyright (c) 2016, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/bloom/blscan.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/relscan.h"
#include "pgstat.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "bloom.h"
/*
 * Start a scan of a bloom index.
 *
 * Only the generic scan descriptor is created here; the bloom-specific
 * opaque state is set up by blrescan.
 */
IndexScanDesc
blbeginscan(Relation r, int nkeys, int norderbys)
{
	return RelationGetIndexScan(r, nkeys, norderbys);
}
/*
 * (Re)start a scan of a bloom index.
 *
 * On the first call the opaque scan state is allocated and its bloom state
 * is initialized; on later calls any previously computed scan signature is
 * freed.  In both cases the signature pointer ends up NULL, so that it is
 * recomputed by the next blgetbitmap call.  If scan keys are supplied they
 * are copied into the scan descriptor.
 */
void
blrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;

	if (so != NULL)
	{
		/* Repeated scan: throw away the old signature, if any */
		if (so->sign != NULL)
			pfree(so->sign);
	}
	else
	{
		/* First use of this scan descriptor: create the opaque state */
		so = (BloomScanOpaque) palloc(sizeof(BloomScanOpaqueData));
		initBloomState(&so->state, scan->indexRelation);
		scan->opaque = so;
	}

	so->sign = NULL;

	if (scankey != NULL && scan->numberOfKeys > 0)
		memmove(scan->keyData, scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
}
/*
 * End scan of bloom index: release the scan signature, if one was computed.
 *
 * The opaque state is created lazily by blrescan, so a scan that is ended
 * without ever having been rescanned still has scan->opaque == NULL; there
 * is nothing to release in that case.
 */
void
blendscan(IndexScanDesc scan)
{
	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;

	/* Never rescanned: no opaque state was allocated */
	if (so == NULL)
		return;

	if (so->sign)
		pfree(so->sign);
	so->sign = NULL;
}
/*
 * Insert all matching tuples into a bitmap.
 *
 * The scan signature is computed from the scan keys on first use.  Every
 * data page of the index is then read, and each tuple whose signature has
 * all bits of the scan signature set is added to "tbm" (flagged for
 * recheck, since a signature match is not proof of equality).
 *
 * Returns the number of heap pointers added to the bitmap; returns 0
 * immediately if any scan key is NULL.
 */
int64
blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	int64		ntids = 0;
	BlockNumber blkno,
				npages;
	int			i;
	int			signLen;
	BufferAccessStrategy bas;
	BloomScanOpaque so = (BloomScanOpaque) scan->opaque;

	signLen = so->state.opts->bloomLength;

	/*
	 * Build the scan signature whenever it is missing, even for a scan
	 * without keys.  In that case the signature is all zeroes and therefore
	 * matches every index tuple; leaving it NULL would make the comparison
	 * loop below dereference a NULL pointer.
	 */
	if (so->sign == NULL)
	{
		/* New search: have to calculate search signature */
		ScanKey		skey = scan->keyData;

		so->sign = palloc0(sizeof(SignType) * signLen);

		for (i = 0; i < scan->numberOfKeys; i++)
		{
			/*
			 * Assume bloom-indexable operators to be strict, so nothing could
			 * be found for NULL key.
			 */
			if (skey->sk_flags & SK_ISNULL)
			{
				pfree(so->sign);
				so->sign = NULL;
				return 0;
			}

			/* Add next value to the signature */
			signValue(&so->state, so->sign, skey->sk_argument,
					  skey->sk_attno - 1);

			skey++;
		}
	}

	/*
	 * We're going to read the whole index.  This is why we use appropriate
	 * buffer access strategy.
	 */
	bas = GetAccessStrategy(BAS_BULKREAD);
	npages = RelationGetNumberOfBlocks(scan->indexRelation);

	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
	{
		Buffer		buffer;
		Page		page;

		buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
									blkno, RBM_NORMAL, bas);

		LockBuffer(buffer, BUFFER_LOCK_SHARE);
		page = BufferGetPage(buffer);

		if (!BloomPageIsDeleted(page))
		{
			OffsetNumber offset,
						maxOffset = BloomPageGetMaxOffset(page);

			for (offset = 1; offset <= maxOffset; offset++)
			{
				BloomTuple *itup = BloomPageGetTuple(&so->state, page, offset);
				bool		res = true;

				/* Check index signature with scan signature */
				for (i = 0; res && i < signLen; i++)
				{
					if ((itup->sign[i] & so->sign[i]) != so->sign[i])
						res = false;
				}

				/* Add matching tuples to bitmap */
				if (res)
				{
					tbm_add_tuples(tbm, &itup->heapPtr, 1, true);
					ntids++;
				}
			}
		}

		UnlockReleaseBuffer(buffer);
		CHECK_FOR_INTERRUPTS();
	}
	FreeAccessStrategy(bas);

	return ntids;
}
/*-------------------------------------------------------------------------
*
* blutils.c
* Bloom index utilities.
*
* Portions Copyright (c) 2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1990-1993, Regents of the University of California
*
* IDENTIFICATION
* contrib/bloom/blutils.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/amapi.h"
#include "access/generic_xlog.h"
#include "catalog/index.h"
#include "storage/lmgr.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/indexfsm.h"
#include "utils/memutils.h"
#include "access/reloptions.h"
#include "storage/freespace.h"
#include "storage/indexfsm.h"
#include "bloom.h"
/*
 * Signature bit-manipulation macros.  "x" points at an array of SignType
 * words and "i" is a zero-based bit number within that array.
 *
 * CLRBIT and SETBIT expand to unparenthesized compound assignments, so use
 * them only as complete statements.  All of these evaluate "i" more than
 * once (directly or through GETWORD); do not pass expressions with side
 * effects.
 */
#define BITSIGNTYPE (BITS_PER_BYTE * sizeof(SignType))
#define GETWORD(x,i) ( *( (SignType*)(x) + (int)( (i) / BITSIGNTYPE ) ) )
#define CLRBIT(x,i) GETWORD(x,i) &= ~( 0x01 << ( (i) % BITSIGNTYPE ) )
#define SETBIT(x,i) GETWORD(x,i) |= ( 0x01 << ( (i) % BITSIGNTYPE ) )
#define GETBIT(x,i) ( (GETWORD(x,i) >> ( (i) % BITSIGNTYPE )) & 0x01 )
PG_FUNCTION_INFO_V1(blhandler);
/* Kind of relation optioms for bloom index */
static relopt_kind bl_relopt_kind;
static int32 myRand();
static void mySrand(uint32 seed);
/*
 * Module initialization function: registers the relation options of bloom
 * indexes.
 *
 * One integer option "length" is registered (default 5, range 1..256),
 * followed by one integer option per possible index column, "col1" ...
 * "col<INDEX_MAX_KEYS>" (default 2, range 1..2048).
 */
void
_PG_init(void)
{
	char		optname[16];
	int			col;

	bl_relopt_kind = add_reloption_kind();

	add_int_reloption(bl_relopt_kind, "length",
					  "Length of signature in uint16 type", 5, 1, 256);

	/* Option names are 1-based: col1, col2, ... */
	for (col = 1; col <= INDEX_MAX_KEYS; col++)
	{
		snprintf(optname, sizeof(optname), "col%d", col);
		add_int_reloption(bl_relopt_kind, optname,
						  "Number of bits for corresponding column", 2, 1, 2048);
	}
}
/*
* Bloom handler function: return IndexAmRoutine with access method parameters
* and callbacks.
*/
Datum
blhandler(PG_FUNCTION_ARGS)
{
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
amroutine->amstrategies = 1;
amroutine->amsupport = 1;
amroutine->amcanorder = false;
amroutine->amcanorderbyop = false;
amroutine->amcanbackward = false;
amroutine->amcanunique = false;
amroutine->amcanmulticol = true;
amroutine->amoptionalkey = true;
amroutine->amsearcharray = false;
amroutine->amsearchnulls = false;
amroutine->amstorage = false;
amroutine->amclusterable = false;
amroutine->ampredlocks = false;
amroutine->amkeytype = 0;
amroutine->aminsert = blinsert;
amroutine->ambeginscan = blbeginscan;
amroutine->amgettuple = NULL;
amroutine->amgetbitmap = blgetbitmap;
amroutine->amrescan = blrescan;
amroutine->amendscan = blendscan;
amroutine->ammarkpos = NULL;
amroutine->amrestrpos = NULL;
amroutine->ambuild = blbuild;
amroutine->ambuildempty = blbuildempty;
amroutine->ambulkdelete = blbulkdelete;
amroutine->amvacuumcleanup = blvacuumcleanup;
amroutine->amcanreturn = NULL;
amroutine->amcostestimate = blcostestimate;
amroutine->amoptions = bloptions;
amroutine->amvalidate = blvalidate;
PG_RETURN_POINTER(amroutine);
}
/*
 * Fill a BloomState structure for the given index.
 *
 * Copies the hash support function of every index column into the state
 * (allocated in the current memory context), makes sure the index options
 * are cached in index->rd_amcache - reading them from the metapage on first
 * use - and precomputes the size of one bloom tuple.
 *
 * Raises an error if block 0 does not look like a bloom metapage.
 */
void
initBloomState(BloomState *state, Relation index)
{
	int			natts = index->rd_att->natts;
	int			attno;

	state->nColumns = natts;

	/* One hash function per indexed column */
	for (attno = 0; attno < natts; attno++)
		fmgr_info_copy(&(state->hashFn[attno]),
					   index_getprocinfo(index, attno + 1, BLOOM_HASH_PROC),
					   CurrentMemoryContext);

	/* Load the options from the metapage unless they are cached already */
	if (index->rd_amcache == NULL)
	{
		BloomOptions *opts;
		Buffer		metaBuf;
		Page		metaPage;
		BloomMetaPageData *meta;

		opts = MemoryContextAlloc(index->rd_indexcxt, sizeof(BloomOptions));

		metaBuf = ReadBuffer(index, BLOOM_METAPAGE_BLKNO);
		LockBuffer(metaBuf, BUFFER_LOCK_SHARE);
		metaPage = BufferGetPage(metaBuf);

		/* Both the page flag and the magic number have to match */
		if (!BloomPageIsMeta(metaPage))
			elog(ERROR, "Relation is not a bloom index");

		meta = BloomPageGetMeta(metaPage);
		if (meta->magickNumber != BLOOM_MAGICK_NUMBER)
			elog(ERROR, "Relation is not a bloom index");

		*opts = meta->opts;

		UnlockReleaseBuffer(metaBuf);

		index->rd_amcache = (void *) opts;
	}

	state->opts = (BloomOptions *) index->rd_amcache;

	/* Fixed header plus bloomLength signature words */
	state->sizeOfBloomTuple = BLOOMTUPLEHDRSZ +
		sizeof(SignType) * state->opts->bloomLength;
}
/*
 * Random generator copied from FreeBSD.  A private generator is used here
 * for two reasons:
 *
 * 1) The random numbers end up in on-disk data, so the sequence must never
 *	  change; relying on PostgreSQL's own generator would tie it to any
 *	  future change made there.
 * 2) Re-seeding PostgreSQL's generator would be an undesirable side effect.
 */
static int32 next;				/* generator state, set by mySrand() */

/*
 * Advance the generator and return the next value.
 *
 * Computes x = (7^5 * x) mod (2^31 - 1) without overflowing 31 bits, using
 * (2^31 - 1) = 127773 * (7^5) + 2836.  From "Random number generators: good
 * ones are hard to find", Park and Miller, Communications of the ACM,
 * vol. 31, no. 10, October 1988, p. 1195.
 */
static int32
myRand(void)
{
	/* State is expected to be in [1, 0x7ffffffe] at this point */
	int32		quot = next / 127773;
	int32		rem = next % 127773;
	int32		x = 16807 * rem - 2836 * quot;

	if (x < 0)
		x += 0x7fffffff;

	next = x;

	/* Shift the result down to the [0, 0x7ffffffd] range */
	return (x - 1);
}
/*
 * Seed the private generator: the seed is stored into the signed state
 * variable and then folded with "% 0x7ffffffe" plus one, which is meant to
 * put the state into the [1, 0x7ffffffe] range.
 */
static void
mySrand(uint32 seed)
{
	int32		start = seed;

	next = (start % 0x7ffffffe) + 1;
}
/*
 * Add the bits for one column value to a signature.
 *
 * "attno" is the zero-based column number.  The bit positions are drawn from
 * the private generator after seeding it with a mix of the column number and
 * the value's hash, and opts->bitSize[attno] bits are set (several draws may
 * hit the same bit).
 */
void
signValue(BloomState *state, SignType *sign, Datum value, int attno)
{
	uint32		hashVal;
	size_t		totalBits;
	int			remaining;
	int			nBit;

	/*
	 * Seed with the column number first, so that the same value stored in
	 * different columns does not map to the same bits.
	 */
	mySrand(attno);

	/*
	 * Then re-seed with the value's hash combined with the first number of
	 * the per-column sequence; this starts the sequence that picks the bits.
	 */
	hashVal = DatumGetInt32(FunctionCall1(&state->hashFn[attno], value));
	mySrand(hashVal ^ myRand());

	totalBits = state->opts->bloomLength * BITSIGNTYPE;

	for (remaining = state->opts->bitSize[attno]; remaining > 0; remaining--)
	{
		/* Compute the position first: SETBIT evaluates its argument twice */
		nBit = myRand() % totalBits;
		SETBIT(sign, nBit);
	}
}
/*
 * Build a bloom tuple for one heap tuple.
 *
 * The result is palloc'd (zero-filled) with the index's tuple size, carries
 * a copy of the heap pointer "iptr", and has the bits of every non-NULL
 * column value added to its signature.  NULL columns contribute nothing.
 */
BloomTuple *
BloomFormTuple(BloomState *state, ItemPointer iptr, Datum *values, bool *isnull)
{
	BloomTuple *tuple = (BloomTuple *) palloc0(state->sizeOfBloomTuple);
	int			col;

	tuple->heapPtr = *iptr;

	for (col = 0; col < state->nColumns; col++)
	{
		if (!isnull[col])
			signValue(state, tuple->sign, values[col], col);
	}

	return tuple;
}
/*
 * Append a bloom tuple to a page.
 *
 * Returns true if the tuple was stored, false if it does not fit on the
 * page (in which case the page is left unchanged).  On success the page's
 * tuple count and pd_lower are advanced past the new tuple.
 */
bool
BloomPageAddItem(BloomState *state, Page page, BloomTuple *tuple)
{
	BloomPageOpaque opaque;
	BloomTuple *slot;
	Pointer		end;

	/* Reject the tuple if the remaining space cannot hold it */
	if (BloomPageGetFreeSpace(state, page) < state->sizeOfBloomTuple)
		return false;

	/* The new tuple goes right after the current last one */
	opaque = BloomPageGetOpaque(page);
	slot = BloomPageGetTuple(state, page, opaque->maxoff + 1);
	memcpy((Pointer) slot, (Pointer) tuple, state->sizeOfBloomTuple);
	opaque->maxoff++;

	/* pd_lower marks the first byte after the last stored tuple */
	end = (Pointer) BloomPageGetNextTuple(state, slot);
	((PageHeader) page)->pd_lower = end - page;

	return true;
}
/*
 * Allocate a new page (either by recycling, or by extending the index file).
 * The returned buffer is already pinned and exclusive-locked.
 * Caller is responsible for initializing the page, e.g. by calling
 * BloomInitPage on it.
 */
Buffer
BloomNewBuffer(Relation index)
{
Buffer buffer;
bool needLock;
/* First, try to get a page from FSM */
for (;;)
{
BlockNumber blkno = GetFreeIndexPage(index);
/* FSM has nothing (more) to offer: fall through to extending the file */
if (blkno == InvalidBlockNumber)
break;
buffer = ReadBuffer(index, blkno);
/*
 * We have to guard against the possibility that someone else already
 * recycled this page; the buffer may be locked if so.  Hence only a
 * conditional lock attempt is made rather than waiting for the lock.
 */
if (ConditionalLockBuffer(buffer))
{
Page page = BufferGetPage(buffer);
if (PageIsNew(page))
return buffer; /* OK to use, if never initialized */
if (BloomPageIsDeleted(page))
return buffer; /* OK to use */
/* Page is in use after all: give the lock back, keep looking */
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}
/* Can't use it, so release buffer and try again */
ReleaseBuffer(buffer);
}
/* Must extend the file */
/* The extension lock is skipped when RELATION_IS_LOCAL(index) is true */
needLock = !RELATION_IS_LOCAL(index);
if (needLock)
LockRelationForExtension(index, ExclusiveLock);
buffer = ReadBuffer(index, P_NEW);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
if (needLock)
UnlockRelationForExtension(index, ExclusiveLock);
return buffer;
}
/*
 * Initialize a page as an empty bloom page.
 *
 * The page gets a standard header with a special area sized for the bloom
 * opaque data; the opaque data is zeroed (so the tuple count starts at 0)
 * and its flags are set to "flags".
 */
void
BloomInitPage(Page page, uint16 flags)
{
	BloomPageOpaque special;

	PageInit(page, BLCKSZ, sizeof(BloomPageOpaqueData));

	special = BloomPageGetOpaque(page);
	memset(special, 0, sizeof(BloomPageOpaqueData));
	special->flags = flags;
}
/*
 * Adjust options of bloom index: replace unset or meaningless values by
 * defaults, in place.
 *
 *	- bloomLength <= 0 becomes 5 (signature of five 16-bit words);
 *	- any bitSize[i] that is <= 0, or not smaller than the number of bits in
 *	  the whole signature, becomes 2.
 */
static void
adjustBloomOptions(BloomOptions *opts)
{
	int			i;
	int			signatureBits;

	/* Default length of bloom filter is 5 of 16-bit integers */
	if (opts->bloomLength <= 0)
		opts->bloomLength = 5;

	/*
	 * bitSize[] counts bits, so the limit must be expressed in bits as well:
	 * bloomLength words of BITSIGNTYPE bits each.  (Comparing against the
	 * signature size in bytes would reject perfectly valid settings.)
	 */
	signatureBits = (int) (opts->bloomLength * BITSIGNTYPE);

	/* Check signature length */
	for (i = 0; i < INDEX_MAX_KEYS; i++)
	{
		/*
		 * Zero and negative number of bits is meaningless.  Also setting
		 * more bits than the signature has seems useless.  Replace both
		 * cases with the 2 bits default.
		 */
		if (opts->bitSize[i] <= 0
			|| opts->bitSize[i] >= signatureBits)
			opts->bitSize[i] = 2;
	}
}
/*
 * Initialize metapage for bloom index.
 *
 * Obtains a buffer (expected to be block 0), makes sure the relation has an
 * options structure with defaults applied, and writes magic number and
 * options into a freshly initialized metapage through the generic WAL
 * interface.
 */
void
BloomInitMetapage(Relation index)
{
Page metaPage;
Buffer metaBuffer;
BloomMetaPageData *metadata;
GenericXLogState *state;
/*
 * Make a new buffer; since it is the first buffer of the index, it should
 * be associated with block number 0 (BLOOM_METAPAGE_BLKNO).
 */
metaBuffer = BloomNewBuffer(index);
Assert(BufferGetBlockNumber(metaBuffer) == BLOOM_METAPAGE_BLKNO);
/*
 * Initialize bloom index options.
 * NOTE(review): when rd_options is NULL the replacement is palloc0'd in
 * whatever memory context is current - confirm it lives long enough.
 */
if (!index->rd_options)
index->rd_options = palloc0(sizeof(BloomOptions));
adjustBloomOptions((BloomOptions *) index->rd_options);
/* Initialize contents of meta page */
state = GenericXLogStart(index);
metaPage = GenericXLogRegister(state, metaBuffer, true);
BloomInitPage(metaPage, BLOOM_META);
metadata = BloomPageGetMeta(metaPage);
/* Zeroing leaves nStart == nEnd == 0, i.e. no not-full pages recorded */
memset(metadata, 0, sizeof(BloomMetaPageData));
metadata->magickNumber = BLOOM_MAGICK_NUMBER;
metadata->opts = *((BloomOptions *) index->rd_options);
/* Account for the metadata in the page header's pd_lower */
((PageHeader) metaPage)->pd_lower += sizeof(BloomMetaPageData);
GenericXLogFinish(state);
UnlockReleaseBuffer(metaBuffer);
}
/*
 * Parse the reloptions of a bloom index into a BloomOptions structure.
 *
 * A parse table is built with one entry for "length" and one entry per
 * possible column ("col1", "col2", ...), each mapped to the matching field
 * of BloomOptions.  The parsed values are then stored into a newly allocated
 * structure, defaults are applied by adjustBloomOptions(), and the structure
 * is returned as a bytea.
 */
bytea *
bloptions(Datum reloptions, bool validate)
{
	relopt_parse_elt tab[INDEX_MAX_KEYS + 1];
	relopt_value *options;
	BloomOptions *rdopts;
	int			numoptions;
	int			col;
	char		name[16];

	/* Entry 0: length of the signature */
	tab[0].optname = "length";
	tab[0].opttype = RELOPT_TYPE_INT;
	tab[0].offset = offsetof(BloomOptions, bloomLength);

	/* Entries 1..INDEX_MAX_KEYS: number of bits for col1, col2, ... */
	for (col = 1; col <= INDEX_MAX_KEYS; col++)
	{
		relopt_parse_elt *elt = &tab[col];

		snprintf(name, sizeof(name), "col%d", col);
		elt->optname = pstrdup(name);
		elt->opttype = RELOPT_TYPE_INT;
		elt->offset = offsetof(BloomOptions, bitSize[col - 1]);
	}

	options = parseRelOptions(reloptions, validate, bl_relopt_kind, &numoptions);
	rdopts = allocateReloptStruct(sizeof(BloomOptions), options, numoptions);
	fillRelOptions((void *) rdopts, sizeof(BloomOptions), options, numoptions,
				   validate, tab, INDEX_MAX_KEYS + 1);

	/* Replace unset or out-of-range values by defaults */
	adjustBloomOptions(rdopts);

	return (bytea *) rdopts;
}
/*-------------------------------------------------------------------------
*
* blvacuum.c
* Bloom VACUUM functions.
*
* Copyright (c) 2016, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/bloom/blvacuum.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "catalog/storage.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "bloom.h"
/*
* Bulk deletion of all index entries pointing to a set of heap tuples.
* The set of target tuples is specified via a callback routine that tells
* whether any given heap tuple (identified by ItemPointer) is being deleted.
*
* Result: a palloc'd struct containing statistical info for VACUUM displays.
*/
/*
 * blbulkdelete -- remove all index tuples that point to dead heap tuples.
 *
 * info:            vacuum context; supplies the index relation and the
 *                  buffer access strategy used for reading pages.
 * stats:           running statistics, or NULL on the first call (in which
 *                  case a zeroed struct is palloc'd here).
 * callback:        returns true for each heap TID that is being deleted.
 * callback_state:  opaque argument passed through to callback.
 *
 * Every data page (from BLOOM_HEAD_BLKNO up to the relation length measured
 * at entry) is scanned under an exclusive lock.  Surviving tuples are
 * compacted towards the start of the page; a page that ends up empty is
 * flagged deleted.  Pages that still hold tuples and have room for more are
 * collected and written to the metapage's notFullPage list.
 *
 * Result: the (possibly newly allocated) stats struct, with tuples_removed
 * and num_index_tuples advanced.
 */
IndexBulkDeleteResult *
blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	index = info->index;
	BlockNumber blkno,
				npages;
	FreeBlockNumberArray notFullPage;
	int			countPage = 0;
	BloomState	state;
	Buffer		buffer;
	Page		page;
	BloomMetaPageData *metaData;
	GenericXLogState *gxlogState;

	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	initBloomState(&state, index);

	/*
	 * Iterate over the pages. We don't care about concurrently added pages,
	 * they can't contain tuples to delete.
	 */
	npages = RelationGetNumberOfBlocks(index);
	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
	{
		BloomTuple *itup,
				   *itupPtr,
				   *itupEnd;

		/*
		 * Honour cost-based vacuum delay (and interrupts) before taking any
		 * buffer lock, as blvacuumcleanup() does.
		 */
		vacuum_delay_point();

		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
									RBM_NORMAL, info->strategy);

		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		gxlogState = GenericXLogStart(index);
		page = GenericXLogRegister(gxlogState, buffer, false);

		/* Nothing to do on a page that is already deleted */
		if (BloomPageIsDeleted(page))
		{
			/* The WAL state was started above; it must not be leaked */
			GenericXLogAbort(gxlogState);
			UnlockReleaseBuffer(buffer);
			continue;
		}

		/*
		 * Iterate over the tuples.  itup is the read position, itupPtr the
		 * write position; they diverge once something has been removed.
		 */
		itup = BloomPageGetTuple(&state, page, 1);
		itupPtr = BloomPageGetTuple(&state, page, 1);
		itupEnd = BloomPageGetTuple(&state, page, BloomPageGetMaxOffset(page) + 1);
		while (itup < itupEnd)
		{
			/* Do we have to delete this tuple? */
			if (callback(&itup->heapPtr, callback_state))
			{
				stats->tuples_removed += 1;
				BloomPageGetOpaque(page)->maxoff--;
			}
			else
			{
				if (itupPtr != itup)
				{
					/*
					 * If we already deleted something before, we have to
					 * move this tuple backward.
					 */
					memmove((Pointer) itupPtr, (Pointer) itup,
							state.sizeOfBloomTuple);
				}
				stats->num_index_tuples++;
				itupPtr = BloomPageGetNextTuple(&state, itupPtr);
			}

			itup = BloomPageGetNextTuple(&state, itup);
		}

		Assert(itupPtr == BloomPageGetTuple(&state, page, BloomPageGetMaxOffset(page) + 1));

		/*
		 * Remember the page as a candidate for future insertions only if it
		 * still holds tuples (an emptied page is about to be marked deleted
		 * and must not be handed out) and has free space.
		 */
		if (BloomPageGetMaxOffset(page) != 0 &&
			BloomPageGetFreeSpace(&state, page) > state.sizeOfBloomTuple &&
			countPage < BloomMetaBlockN)
			notFullPage[countPage++] = blkno;

		/* Did we delete something? */
		if (itupPtr != itup)
		{
			/* Is it empty page now? */
			if (itupPtr == BloomPageGetData(page))
				BloomPageSetDeleted(page);
			/* Adjust pd_lower */
			((PageHeader) page)->pd_lower = (Pointer) itupPtr - page;
			/* Finish WAL-logging */
			GenericXLogFinish(gxlogState);
		}
		else
		{
			/* Didn't change anything: abort WAL-logging */
			GenericXLogAbort(gxlogState);
		}
		UnlockReleaseBuffer(buffer);
	}

	/*
	 * Replace the metapage's notFullPage list with what we found.  This is
	 * done even when the list is empty, so that entries left over from
	 * before this pass (which may now refer to deleted pages) are dropped.
	 * Only the filled part of the local array is copied; the rest of it is
	 * uninitialized.
	 */
	buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO);
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	gxlogState = GenericXLogStart(index);
	page = GenericXLogRegister(gxlogState, buffer, false);

	metaData = BloomPageGetMeta(page);
	memcpy(metaData->notFullPage, notFullPage,
		   sizeof(notFullPage[0]) * countPage);
	metaData->nStart = 0;
	metaData->nEnd = countPage;

	GenericXLogFinish(gxlogState);
	UnlockReleaseBuffer(buffer);

	return stats;
}
/*
* Post-VACUUM cleanup.
*
* Result: a palloc'd struct containing statistical info for VACUUM displays.
*/
/*
 * blvacuumcleanup -- post-VACUUM cleanup.
 *
 * info:   vacuum context; supplies the index relation, the buffer access
 *         strategy and the analyze_only flag.
 * stats:  statistics from blbulkdelete(), or NULL if it was not called
 *         (in which case a zeroed struct is palloc'd here).
 *
 * Scans every data page under a share lock, records deleted pages in the
 * free space map and counts the tuples on the remaining pages.  Does
 * nothing when called for ANALYZE only.
 *
 * Result: the stats struct with num_pages, pages_free and num_index_tuples
 * filled in from this scan.
 */
IndexBulkDeleteResult *
blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	Relation	index = info->index;
	BlockNumber npages,
				blkno;

	if (info->analyze_only)
		return stats;

	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/*
	 * Iterate over the pages: insert deleted pages into FSM and collect
	 * statistics.
	 *
	 * num_index_tuples is recounted from scratch: blbulkdelete() may already
	 * have accumulated a total in it, and adding to that would count every
	 * tuple twice.  estimated_count is a flag, not a counter, so it is left
	 * alone.
	 */
	npages = RelationGetNumberOfBlocks(index);
	stats->num_pages = npages;
	stats->pages_free = 0;
	stats->num_index_tuples = 0;
	for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
	{
		Buffer		buffer;
		Page		page;

		vacuum_delay_point();

		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
									RBM_NORMAL, info->strategy);
		LockBuffer(buffer, BUFFER_LOCK_SHARE);
		page = (Page) BufferGetPage(buffer);

		if (BloomPageIsDeleted(page))
		{
			RecordFreeIndexPage(index, blkno);
			stats->pages_free++;
		}
		else
		{
			stats->num_index_tuples += BloomPageGetMaxOffset(page);
		}

		UnlockReleaseBuffer(buffer);
	}

	IndexFreeSpaceMapVacuum(info->index);

	return stats;
}
/*-------------------------------------------------------------------------
*
* blvalidate.c
* Opclass validator for bloom.
*
* Copyright (c) 2016, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/bloom/blvalidate.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/amvalidate.h"
#include "access/htup_details.h"
#include "catalog/pg_amop.h"
#include "catalog/pg_amproc.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_opfamily.h"
#include "catalog/pg_type.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "bloom.h"
/*
* Validator for a bloom opclass.
*/
/*
 * blvalidate -- validator for a bloom opclass.
 *
 * opclassoid: OID of the operator class to check.
 *
 * Looks up the opclass and its opfamily, then checks every support function
 * and operator registered in the family, and finally verifies that the
 * opclass itself has all support functions 1..BLOOM_NPROC.  Each problem
 * found is reported at INFO level; checking continues so that all problems
 * are listed.
 *
 * Returns true if no problem was found, false otherwise.  Raises ERROR only
 * if the opclass or opfamily cannot be found in the system cache.
 */
bool
blvalidate(Oid opclassoid)
{
	bool		result = true;
	HeapTuple	classtup;
	Form_pg_opclass classform;
	Oid			opfamilyoid;
	Oid			opcintype;
	Oid			opckeytype;
	char	   *opclassname;
	HeapTuple	familytup;
	Form_pg_opfamily familyform;
	char	   *opfamilyname;
	CatCList   *proclist,
			   *oprlist;
	List	   *grouplist;
	OpFamilyOpFuncGroup *opclassgroup;
	int			i;
	ListCell   *lc;

	/* Fetch opclass information */
	classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
	if (!HeapTupleIsValid(classtup))
		elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
	classform = (Form_pg_opclass) GETSTRUCT(classtup);

	opfamilyoid = classform->opcfamily;
	opcintype = classform->opcintype;
	opckeytype = classform->opckeytype;
	/* No separate storage type given: the key type is the input type */
	if (!OidIsValid(opckeytype))
		opckeytype = opcintype;
	opclassname = NameStr(classform->opcname);

	/* Fetch opfamily information */
	familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
	if (!HeapTupleIsValid(familytup))
		elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
	familyform = (Form_pg_opfamily) GETSTRUCT(familytup);

	opfamilyname = NameStr(familyform->opfname);

	/* Fetch all operators and support functions of the opfamily */
	oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
	proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));

	/* Check individual support functions */
	for (i = 0; i < proclist->n_members; i++)
	{
		HeapTuple	proctup = &proclist->members[i]->tuple;
		Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
		bool		ok;

		/*
		 * All bloom support functions should be registered with matching
		 * left/right types
		 */
		if (procform->amproclefttype != procform->amprocrighttype)
		{
			ereport(INFO,
					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
					 errmsg("bloom opfamily %s contains support procedure %s with cross-type registration",
							opfamilyname,
							format_procedure(procform->amproc))));
			result = false;
		}

		/*
		 * We can't check signatures except within the specific opclass, since
		 * we need to know the associated opckeytype in many cases.
		 */
		if (procform->amproclefttype != opcintype)
			continue;

		/* Check procedure numbers and function signatures */
		switch (procform->amprocnum)
		{
			case BLOOM_HASH_PROC:
				/* Expected shape: int4 hash(opckeytype), exactly one arg */
				ok = check_amproc_signature(procform->amproc, INT4OID, false,
											1, 1, opckeytype);
				break;
			default:
				ereport(INFO,
						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
						 errmsg("bloom opfamily %s contains function %s with invalid support number %d",
								opfamilyname,
								format_procedure(procform->amproc),
								procform->amprocnum)));
				result = false;
				continue;		/* don't want additional message */
		}

		if (!ok)
		{
			ereport(INFO,
					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
					 errmsg("bloom opfamily %s contains function %s with wrong signature for support number %d",
							opfamilyname,
							format_procedure(procform->amproc),
							procform->amprocnum)));
			result = false;
		}
	}

	/* Check individual operators */
	for (i = 0; i < oprlist->n_members; i++)
	{
		HeapTuple	oprtup = &oprlist->members[i]->tuple;
		Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);

		/* Check it's allowed strategy for bloom */
		if (oprform->amopstrategy < 1 ||
			oprform->amopstrategy > BLOOM_NSTRATEGIES)
		{
			ereport(INFO,
					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
					 errmsg("bloom opfamily %s contains operator %s with invalid strategy number %d",
							opfamilyname,
							format_operator(oprform->amopopr),
							oprform->amopstrategy)));
			result = false;
		}

		/* bloom doesn't support ORDER BY operators */
		if (oprform->amoppurpose != AMOP_SEARCH ||
			OidIsValid(oprform->amopsortfamily))
		{
			ereport(INFO,
					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
					 errmsg("bloom opfamily %s contains invalid ORDER BY specification for operator %s",
							opfamilyname,
							format_operator(oprform->amopopr))));
			result = false;
		}

		/* Check operator signature --- same for all bloom strategies */
		if (!check_amop_signature(oprform->amopopr, BOOLOID,
								  oprform->amoplefttype,
								  oprform->amoprighttype))
		{
			ereport(INFO,
					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
					 errmsg("bloom opfamily %s contains operator %s with wrong signature",
							opfamilyname,
							format_operator(oprform->amopopr))));
			result = false;
		}
	}

	/* Now check for inconsistent groups of operators/functions */
	grouplist = identify_opfamily_groups(oprlist, proclist);
	opclassgroup = NULL;
	foreach(lc, grouplist)
	{
		OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);

		/* Remember the group exactly matching the test opclass */
		if (thisgroup->lefttype == opcintype &&
			thisgroup->righttype == opcintype)
			opclassgroup = thisgroup;

		/*
		 * There is not a lot we can do to check the operator sets, since each
		 * bloom opclass is more or less a law unto itself, and some contain
		 * only operators that are binary-compatible with the opclass datatype
		 * (meaning that empty operator sets can be OK).  That case also means
		 * that we shouldn't insist on nonempty function sets except for the
		 * opclass's own group.
		 */
	}

	/*
	 * Check that the originally-named opclass is complete.  Bit i of
	 * functionset is tested for support function number i; a missing group
	 * means every function is missing.
	 */
	for (i = 1; i <= BLOOM_NPROC; i++)
	{
		if (opclassgroup &&
			(opclassgroup->functionset & (((uint64) 1) << i)) != 0)
			continue;			/* got it */
		ereport(INFO,
				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
				 errmsg("bloom opclass %s is missing support function %d",
						opclassname, i)));
		result = false;
	}

	/* Release the catalog cache references in reverse order of acquisition */
	ReleaseCatCacheList(proclist);
	ReleaseCatCacheList(oprlist);
	ReleaseSysCache(familytup);
	ReleaseSysCache(classtup);

	return result;
}
CREATE EXTENSION bloom;
CREATE TABLE tst (
i int4,
t text
);
INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;
CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3);
SET enable_seqscan=on;
SET enable_bitmapscan=off;
SET enable_indexscan=off;
SELECT count(*) FROM tst WHERE i = 7;
count
-------
10000
(1 row)
SELECT count(*) FROM tst WHERE t = '5';
count
-------
6264
(1 row)
SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
count
-------
588
(1 row)
SET enable_seqscan=off;
SET enable_bitmapscan=on;
SET enable_indexscan=on;
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7;
QUERY PLAN
-------------------------------------------
Aggregate
-> Bitmap Heap Scan on tst
Recheck Cond: (i = 7)
-> Bitmap Index Scan on bloomidx
Index Cond: (i = 7)
(5 rows)
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5';
QUERY PLAN
-------------------------------------------
Aggregate
-> Bitmap Heap Scan on tst
Recheck Cond: (t = '5'::text)
-> Bitmap Index Scan on bloomidx
Index Cond: (t = '5'::text)
(5 rows)
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
QUERY PLAN
---------------------------------------------------------
Aggregate
-> Bitmap Heap Scan on tst
Recheck Cond: ((i = 7) AND (t = '5'::text))
-> Bitmap Index Scan on bloomidx
Index Cond: ((i = 7) AND (t = '5'::text))
(5 rows)
SELECT count(*) FROM tst WHERE i = 7;
count
-------
10000
(1 row)
SELECT count(*) FROM tst WHERE t = '5';
count
-------
6264
(1 row)
SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
count
-------
588
(1 row)
DELETE FROM tst;
INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;
VACUUM ANALYZE tst;
SELECT count(*) FROM tst WHERE i = 7;
count
-------
10000
(1 row)
SELECT count(*) FROM tst WHERE t = '5';
count
-------
6264
(1 row)
SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
count
-------
588
(1 row)
VACUUM FULL tst;
SELECT count(*) FROM tst WHERE i = 7;
count
-------
10000
(1 row)
SELECT count(*) FROM tst WHERE t = '5';
count
-------
6264
(1 row)
SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
count
-------
588
(1 row)
RESET enable_seqscan;
RESET enable_bitmapscan;
RESET enable_indexscan;
CREATE EXTENSION bloom;
CREATE TABLE tst (
i int4,
t text
);
INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;
CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3);
SET enable_seqscan=on;
SET enable_bitmapscan=off;
SET enable_indexscan=off;
SELECT count(*) FROM tst WHERE i = 7;
SELECT count(*) FROM tst WHERE t = '5';
SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
SET enable_seqscan=off;
SET enable_bitmapscan=on;
SET enable_indexscan=on;
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7;
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5';
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
SELECT count(*) FROM tst WHERE i = 7;
SELECT count(*) FROM tst WHERE t = '5';
SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
DELETE FROM tst;
INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;
VACUUM ANALYZE tst;
SELECT count(*) FROM tst WHERE i = 7;
SELECT count(*) FROM tst WHERE t = '5';
SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
VACUUM FULL tst;
SELECT count(*) FROM tst WHERE i = 7;
SELECT count(*) FROM tst WHERE t = '5';
SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
RESET enable_seqscan;
RESET enable_bitmapscan;
RESET enable_indexscan;
# Test generic xlog record work for bloom index replication.
use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More tests => 31;
my $node_master;
my $node_standby;
# Run few queries on both master and standby and check their results match.
# test_index_replay($test_name)
#
# Waits until the standby has replayed all WAL generated so far on the
# master, then runs the same set of bloom-index queries on both nodes and
# reports one test result named "$test_name: query result matches".
# Dies if the standby does not catch up before poll_query_until gives up.
sub test_index_replay
{
	my ($test_name) = @_;

	# Wait for standby to catch up.  We must wait for replay, not merely
	# for the WAL to be written on the standby: the queries below read the
	# standby's data, which only reflects WAL that has been applied.
	my $applname = $node_standby->name;
	my $caughtup_query =
"SELECT pg_current_xlog_location() <= replay_location FROM pg_stat_replication WHERE application_name = '$applname';";
	$node_master->poll_query_until('postgres', $caughtup_query)
	  or die "Timed out while waiting for standby 1 to catch up";

	# Force index usage so that the replayed bloom index is what gets tested
	my $queries = qq(SET enable_seqscan=off;
SET enable_bitmapscan=on;
SET enable_indexscan=on;
SELECT * FROM tst WHERE i = 0;
SELECT * FROM tst WHERE i = 3;
SELECT * FROM tst WHERE t = 'b';
SELECT * FROM tst WHERE t = 'f';
SELECT * FROM tst WHERE i = 3 AND t = 'c';
SELECT * FROM tst WHERE i = 7 AND t = 'e';
);

	# Run test queries and compare their result
	my $master_result = $node_master->psql("postgres", $queries);
	my $standby_result = $node_standby->psql("postgres", $queries);

	is($master_result, $standby_result, "$test_name: query result matches");
}
# Initialize master node with streaming replication enabled
$node_master = get_new_node('master');
$node_master->init(allows_streaming => 1);
$node_master->start;
my $backup_name = 'my_backup';
# Take a base backup of the master; the standby is created from it
$node_master->backup($backup_name);
# Create streaming standby linking to master
$node_standby = get_new_node('standby');
$node_standby->init_from_backup($node_master, $backup_name,
has_streaming => 1);
$node_standby->start;
# Create a table with a bloom index on the master.  All of this happens
# after the backup was taken, so it reaches the standby only through WAL.
$node_master->psql("postgres", "CREATE EXTENSION bloom;");
$node_master->psql("postgres", "CREATE TABLE tst (i int4, t text);");
$node_master->psql("postgres", "INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;");
$node_master->psql("postgres", "CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3);");
# Test that queries give same result
test_index_replay('initial');
# Run 10 cycles of table modification. Run test queries after each modification.
# Each cycle runs three checks (delete, vacuum, insert).
for my $i (1..10)
{
# Column i only holds values 0..9 (i%10), so the last cycle (i = 10)
# deletes no rows.
$node_master->psql("postgres", "DELETE FROM tst WHERE i = $i;");
test_index_replay("delete $i");
$node_master->psql("postgres", "VACUUM tst;");
test_index_replay("vacuum $i");
# Insert the next 10000 source values, continuing after those already used
my ($start, $end) = (100001 + ($i - 1) * 10000, 100000 + $i * 10000);
$node_master->psql("postgres", "INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series($start,$end) i;");
test_index_replay("insert $i");
}
<!-- doc/src/sgml/bloom.sgml -->
<sect1 id="bloom" xreflabel="bloom">
<title>bloom</title>
<indexterm zone="bloom">
<primary>bloom</primary>
</indexterm>
<para>
<literal>bloom</> is a contrib module that implements an index access method.
It serves as an example of a custom access method and of generic WAL record
usage, but it is also useful in its own right.
</para>
<sect2>
<title>Introduction</title>
<para>
This module implements an index based on a
<ulink url="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</ulink>,
which allows fast exclusion of non-candidate tuples.
Since a signature is a lossy representation of all indexed attributes,
search results must be rechecked using heap information.
The user can specify the signature length (in uint16 units, default is 5) and
the number of bits that can be set per attribute (1 &lt; colN &lt; 2048).
</para>
<para>
This index is useful if a table has many attributes and queries can include
arbitrary combinations of them. A traditional <literal>btree</> index is faster
than a bloom index, but it would require too many indexes to support all possible
queries, while a single bloom index is sufficient. A bloom index supports only
equality comparisons. Since it is a signature file, not a tree, it must always
be read in full, but sequentially, so index search performance is
constant and doesn't depend on the query.
</para>
</sect2>
<sect2>
<title>Parameters</title>
<para>
<literal>bloom</> indexes accept the following parameters in the
<literal>WITH</> clause.
</para>
<variablelist>
<varlistentry>
<term><literal>length</></term>
<listitem>
<para>
Length of the signature, in uint16 values (16 bits each)
</para>
</listitem>
</varlistentry>
</variablelist>
<variablelist>
<varlistentry>
<term><literal>col1 &mdash; col32</></term>
<listitem>
<para>
Number of bits for the corresponding column
</para>
</listitem>
</varlistentry>
</variablelist>
</sect2>
<sect2>
<title>Examples</title>
<para>
An example of an index definition is given below.
</para>
<programlisting>
CREATE INDEX bloomidx ON tbloom USING bloom (i1,i2,i3)
WITH (length=5, col1=2, col2=2, col3=4);
</programlisting>
<para>
Here, we create a bloom index with a signature length of 80 bits, with
attributes i1 and i2 mapped to 2 bits each and attribute i3 mapped to 4 bits.
</para>
<para>
An example of index definition and usage is given below.
</para>
<programlisting>
CREATE TABLE tbloom AS
SELECT
random()::int as i1,
random()::int as i2,
random()::int as i3,
random()::int as i4,
random()::int as i5,
random()::int as i6,
random()::int as i7,
random()::int as i8,
random()::int as i9,
random()::int as i10,
random()::int as i11,
random()::int as i12,
random()::int as i13
FROM
generate_series(1,1000);
CREATE INDEX bloomidx ON tbloom USING
bloom (i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12);
SELECT pg_relation_size('bloomidx');
CREATE index btree_idx ON tbloom(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12);
SELECT pg_relation_size('btree_idx');
</programlisting>
<programlisting>
=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on tbloom (cost=1.50..5.52 rows=1 width=52) (actual time=0.057..0.057 rows=0 loops=1)
Recheck Cond: ((i2 = 20) AND (i10 = 15))
-> Bitmap Index Scan on bloomidx (cost=0.00..1.50 rows=1 width=0) (actual time=0.041..0.041 rows=9 loops=1)
Index Cond: ((i2 = 20) AND (i10 = 15))
Total runtime: 0.081 ms
(5 rows)
</programlisting>
<para>
A sequential scan is slower.
</para>
<programlisting>
=# SET enable_bitmapscan = off;
=# SET enable_indexscan = off;
=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15;
QUERY PLAN
--------------------------------------------------------------------------------------------------
Seq Scan on tbloom (cost=0.00..25.00 rows=1 width=52) (actual time=0.162..0.162 rows=0 loops=1)
Filter: ((i2 = 20) AND (i10 = 15))
Total runtime: 0.181 ms
(3 rows)
</programlisting>
<para>
A btree index will not be used for this query.
</para>
<programlisting>
=# DROP INDEX bloomidx;
=# CREATE INDEX btree_idx ON tbloom(i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12);
=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15;
QUERY PLAN
--------------------------------------------------------------------------------------------------
Seq Scan on tbloom (cost=0.00..25.00 rows=1 width=52) (actual time=0.210..0.210 rows=0 loops=1)
Filter: ((i2 = 20) AND (i10 = 15))
Total runtime: 0.250 ms
(3 rows)
</programlisting>
</sect2>
<sect2>
<title>Opclass interface</title>
<para>
The bloom opclass interface is simple. It requires one support function:
a hash function for the indexed datatype. It provides one search operator:
the equality operator. The example below shows an <literal>opclass</>
definition for the <literal>text</> datatype.
</para>
<programlisting>
CREATE OPERATOR CLASS text_ops
DEFAULT FOR TYPE text USING bloom AS
OPERATOR 1 =(text, text),
FUNCTION 1 hashtext(text);
</programlisting>
</sect2>
<sect2>
<title>Limitations</title>
<para>
<itemizedlist>
<listitem>
<para>
For now, only opclasses for <literal>int4</> and <literal>text</> come
with the module. However, users may define more of them.
</para>
</listitem>
<listitem>
<para>
Only the <literal>=</literal> operator is supported for search at present,
but it would be possible to add support for arrays with contains and
intersection operations in the future.
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
<sect2>
<title>Authors</title>
<para>
Teodor Sigaev <email>teodor@postgrespro.ru</email>, Postgres Professional, Moscow, Russia
</para>
<para>
Alexander Korotkov <email>a.korotkov@postgrespro.ru</email>, Postgres Professional, Moscow, Russia
</para>
<para>
Oleg Bartunov <email>obartunov@postgrespro.ru</email>, Postgres Professional, Moscow, Russia
</para>
</sect2>
</sect1>
......@@ -105,6 +105,7 @@ CREATE EXTENSION <replaceable>module_name</> FROM unpackaged;
&adminpack;
&auth-delay;
&auto-explain;
&bloom;
&btree-gin;
&btree-gist;
&chkpass;
......
......@@ -107,6 +107,7 @@
<!ENTITY adminpack SYSTEM "adminpack.sgml">
<!ENTITY auth-delay SYSTEM "auth-delay.sgml">
<!ENTITY auto-explain SYSTEM "auto-explain.sgml">
<!ENTITY bloom SYSTEM "bloom.sgml">
<!ENTITY btree-gin SYSTEM "btree-gin.sgml">
<!ENTITY btree-gist SYSTEM "btree-gist.sgml">
<!ENTITY chkpass SYSTEM "chkpass.sgml">
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment