Commit 0454f131 authored by Tom Lane's avatar Tom Lane

Rewrite the rbtree routines so that an RBNode is the first field of the

struct representing a tree entry, rather than being a separately allocated
piece of storage.  This API is at least as clean as the old one (if not
more so --- there were some bizarre choices in there) and it permits a
very substantial memory savings, on the order of 2X in ginbulk.c's usage.

Also, fix minor memory leaks in code called by ginEntryInsert, in
particular in ginInsertValue and entryFillRoot, as well as ginEntryInsert
itself.  These leaks resulted in the GIN index build context continuing
to bloat even after we'd filled it to maintenance_work_mem and started
to dump data out to the index.

In combination these fixes restore the GIN index build code to honoring
the maintenance_work_mem limit about as well as it did in 8.4.  Speed
seems on par with 8.4 too, maybe even a bit faster, for a non-pathological
case in which HEAD was formerly slower.

Back-patch to 9.0 so we don't have a performance regression from 8.4.
parent afc2900f
......@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/ginbtree.c,v 1.15 2010/01/02 16:57:33 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/gin/ginbtree.c,v 1.16 2010/08/01 02:12:42 tgl Exp $
*-------------------------------------------------------------------------
*/
......@@ -267,6 +267,8 @@ findParents(GinBtree btree, GinBtreeStack *stack,
/*
* Insert value (stored in GinBtree) to tree described by stack
*
* NB: the passed-in stack is freed, as though by freeGinBtreeStack.
*/
void
ginInsertValue(GinBtree btree, GinBtreeStack *stack)
......@@ -308,10 +310,11 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
PageSetTLI(page, ThisTimeLineID);
}
UnlockReleaseBuffer(stack->buffer);
LockBuffer(stack->buffer, GIN_UNLOCK);
END_CRIT_SECTION();
freeGinBtreeStack(stack->parent);
freeGinBtreeStack(stack);
return;
}
else
......@@ -325,7 +328,6 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
*/
newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata);
((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno;
parent = stack->parent;
......@@ -341,7 +343,6 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE;
((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber;
page = BufferGetPage(stack->buffer);
lpage = BufferGetPage(lbuffer);
rpage = BufferGetPage(rbuffer);
......@@ -375,10 +376,11 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
UnlockReleaseBuffer(rbuffer);
UnlockReleaseBuffer(lbuffer);
UnlockReleaseBuffer(stack->buffer);
LockBuffer(stack->buffer, GIN_UNLOCK);
END_CRIT_SECTION();
freeGinBtreeStack(stack);
return;
}
else
......
......@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/ginbulk.c,v 1.19 2010/02/26 02:00:33 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/gin/ginbulk.c,v 1.20 2010/08/01 02:12:42 tgl Exp $
*-------------------------------------------------------------------------
*/
......@@ -19,17 +19,21 @@
#include "utils/memutils.h"
#define DEF_NENTRY 2048
#define DEF_NPTR 4
#define DEF_NENTRY 2048 /* EntryAccumulator allocation quantum */
#define DEF_NPTR 5 /* ItemPointer initial allocation quantum */
static void *
ginAppendData(void *old, void *new, void *arg)
{
EntryAccumulator *eo = (EntryAccumulator *) old,
*en = (EntryAccumulator *) new;
/* Combiner function for rbtree.c */
static void
ginCombineData(RBNode *existing, const RBNode *newdata, void *arg)
{
EntryAccumulator *eo = (EntryAccumulator *) existing;
const EntryAccumulator *en = (const EntryAccumulator *) newdata;
BuildAccumulator *accum = (BuildAccumulator *) arg;
/*
* Note this code assumes that newdata contains only one itempointer.
*/
if (eo->number >= eo->length)
{
accum->allocatedMemory -= GetMemoryChunkSpace(eo->list);
......@@ -53,29 +57,57 @@ ginAppendData(void *old, void *new, void *arg)
eo->list[eo->number] = en->list[0];
eo->number++;
return old;
}
/* Comparator function for rbtree.c */
static int
cmpEntryAccumulator(const void *a, const void *b, void *arg)
cmpEntryAccumulator(const RBNode *a, const RBNode *b, void *arg)
{
EntryAccumulator *ea = (EntryAccumulator *) a;
EntryAccumulator *eb = (EntryAccumulator *) b;
const EntryAccumulator *ea = (const EntryAccumulator *) a;
const EntryAccumulator *eb = (const EntryAccumulator *) b;
BuildAccumulator *accum = (BuildAccumulator *) arg;
return compareAttEntries(accum->ginstate, ea->attnum, ea->value,
eb->attnum, eb->value);
}
/* Allocator function for rbtree.c */
static RBNode *
ginAllocEntryAccumulator(void *arg)
{
BuildAccumulator *accum = (BuildAccumulator *) arg;
EntryAccumulator *ea;
/*
* Allocate memory by rather big chunks to decrease overhead. We have
* no need to reclaim RBNodes individually, so this costs nothing.
*/
if (accum->entryallocator == NULL || accum->length >= DEF_NENTRY)
{
accum->entryallocator = palloc(sizeof(EntryAccumulator) * DEF_NENTRY);
accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator);
accum->length = 0;
}
/* Allocate new RBNode from current chunk */
ea = accum->entryallocator + accum->length;
accum->length++;
return (RBNode *) ea;
}
void
ginInitBA(BuildAccumulator *accum)
{
accum->allocatedMemory = 0;
accum->length = 0;
accum->entryallocator = NULL;
accum->tree = rb_create(cmpEntryAccumulator, ginAppendData, NULL, accum);
accum->iterator = NULL;
accum->tmpList = NULL;
accum->tree = rb_create(sizeof(EntryAccumulator),
cmpEntryAccumulator,
ginCombineData,
ginAllocEntryAccumulator,
NULL, /* no freefunc needed */
(void *) accum);
}
/*
......@@ -104,55 +136,41 @@ getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value)
static void
ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum, Datum entry)
{
EntryAccumulator *key,
*ea;
EntryAccumulator key;
EntryAccumulator *ea;
bool isNew;
/*
* Allocate memory by rather big chunk to decrease overhead, we don't keep
* pointer to previously allocated chunks because they will free by
* MemoryContextReset() call.
* For the moment, fill only the fields of key that will be looked at
* by cmpEntryAccumulator or ginCombineData.
*/
if (accum->entryallocator == NULL || accum->length >= DEF_NENTRY)
{
accum->entryallocator = palloc(sizeof(EntryAccumulator) * DEF_NENTRY);
accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator);
accum->length = 0;
}
key.attnum = attnum;
key.value = entry;
/* temporarily set up single-entry itempointer list */
key.list = heapptr;
/* "Allocate" new key in chunk */
key = accum->entryallocator + accum->length;
accum->length++;
ea = (EntryAccumulator *) rb_insert(accum->tree, (RBNode *) &key, &isNew);
key->attnum = attnum;
key->value = entry;
/* To prevent multiple palloc/pfree cycles, we reuse array */
if (accum->tmpList == NULL)
accum->tmpList =
(ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR);
key->list = accum->tmpList;
key->list[0] = *heapptr;
ea = rb_insert(accum->tree, key);
if (ea == NULL)
if (isNew)
{
/*
* The key has been inserted, so continue initialization.
* Finish initializing new tree entry, including making permanent
* copies of the datum and itempointer.
*/
key->value = getDatumCopy(accum, attnum, entry);
key->length = DEF_NPTR;
key->number = 1;
key->shouldSort = FALSE;
accum->allocatedMemory += GetMemoryChunkSpace(key->list);
accum->tmpList = NULL;
ea->value = getDatumCopy(accum, attnum, entry);
ea->length = DEF_NPTR;
ea->number = 1;
ea->shouldSort = FALSE;
ea->list =
(ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR);
ea->list[0] = *heapptr;
accum->allocatedMemory += GetMemoryChunkSpace(ea->list);
}
else
{
/*
* The key has been appended, so "free" allocated key by decrementing
* chunk's counter.
* ginCombineData did everything needed.
*/
accum->length--;
}
}
......@@ -214,16 +232,20 @@ qsortCompareItemPointers(const void *a, const void *b)
return res;
}
/* Prepare to read out the rbtree contents using ginGetEntry */
void
ginBeginBAScan(BuildAccumulator *accum)
{
rb_begin_iterate(accum->tree, LeftRightWalk);
}
ItemPointerData *
ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *value, uint32 *n)
{
EntryAccumulator *entry;
ItemPointerData *list;
if (accum->iterator == NULL)
accum->iterator = rb_begin_iterate(accum->tree, LeftRightWalk);
entry = rb_iterate(accum->iterator);
entry = (EntryAccumulator *) rb_iterate(accum->tree);
if (entry == NULL)
return NULL;
......
......@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.24 2010/02/26 02:00:33 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.25 2010/08/01 02:12:42 tgl Exp $
*-------------------------------------------------------------------------
*/
......@@ -615,7 +615,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
}
/*
* return newly allocate rightmost tuple
* return newly allocated rightmost tuple
*/
IndexTuple
ginPageGetLinkItup(Buffer buf)
......@@ -646,10 +646,12 @@ entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
itup = ginPageGetLinkItup(lbuf);
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
pfree(itup);
itup = ginPageGetLinkItup(rbuf);
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
pfree(itup);
}
void
......
......@@ -11,7 +11,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/ginfast.c,v 1.7 2010/02/11 14:29:50 teodor Exp $
* $PostgreSQL: pgsql/src/backend/access/gin/ginfast.c,v 1.8 2010/08/01 02:12:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -786,6 +786,7 @@ ginInsertCleanup(Relation index, GinState *ginstate,
* significant amount of time - so, run it without locking pending
* list.
*/
ginBeginBAScan(&accum);
while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
{
ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE);
......@@ -820,6 +821,7 @@ ginInsertCleanup(Relation index, GinState *ginstate,
ginInitBA(&accum);
processPendingPage(&accum, &datums, page, maxoff + 1);
ginBeginBAScan(&accum);
while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE);
}
......
......@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.26 2010/02/11 14:29:50 teodor Exp $
* $PostgreSQL: pgsql/src/backend/access/gin/gininsert.c,v 1.27 2010/08/01 02:12:42 tgl Exp $
*-------------------------------------------------------------------------
*/
......@@ -176,6 +176,7 @@ ginEntryInsert(Relation index, GinState *ginstate,
gdi = prepareScanPostingTree(index, rootPostingTree, FALSE);
gdi->btree.isBuild = isBuild;
insertItemPointer(gdi, items, nitem);
pfree(gdi);
return;
}
......@@ -254,6 +255,7 @@ ginBuildCallback(Relation index, HeapTuple htup, Datum *values,
uint32 nlist;
OffsetNumber attnum;
ginBeginBAScan(&buildstate->accum);
while ((list = ginGetEntry(&buildstate->accum, &attnum, &entry, &nlist)) != NULL)
{
/* there could be many entries, so be willing to abort here */
......@@ -360,6 +362,7 @@ ginbuild(PG_FUNCTION_ARGS)
/* dump remaining entries to the index */
oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
ginBeginBAScan(&buildstate.accum);
while ((list = ginGetEntry(&buildstate.accum, &attnum, &entry, &nlist)) != NULL)
{
/* there could be many entries, so be willing to abort here */
......
This diff is collapsed.
......@@ -4,7 +4,7 @@
*
* Copyright (c) 2006-2010, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/include/access/gin.h,v 1.39 2010/07/31 00:30:54 tgl Exp $
* $PostgreSQL: pgsql/src/include/access/gin.h,v 1.40 2010/08/01 02:12:42 tgl Exp $
*--------------------------------------------------------------------------
*/
#ifndef GIN_H
......@@ -565,6 +565,7 @@ extern Datum ginarrayconsistent(PG_FUNCTION_ARGS);
/* ginbulk.c */
typedef struct EntryAccumulator
{
RBNode rbnode;
Datum value;
uint32 length;
uint32 number;
......@@ -579,15 +580,14 @@ typedef struct
long allocatedMemory;
uint32 length;
EntryAccumulator *entryallocator;
ItemPointerData *tmpList;
RBTree *tree;
RBTreeIterator *iterator;
} BuildAccumulator;
extern void ginInitBA(BuildAccumulator *accum);
extern void ginInsertRecordBA(BuildAccumulator *accum,
ItemPointer heapptr,
OffsetNumber attnum, Datum *entries, int32 nentry);
extern void ginBeginBAScan(BuildAccumulator *accum);
extern ItemPointerData *ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *entry, uint32 *n);
/* ginfast.c */
......
......@@ -3,44 +3,64 @@
* rbtree.h
* interface for PostgreSQL generic Red-Black binary tree package
*
* Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Copyright (c) 2009-2010, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/include/utils/rbtree.h,v 1.3 2010/05/11 18:14:01 rhaas Exp $
* $PostgreSQL: pgsql/src/include/utils/rbtree.h,v 1.4 2010/08/01 02:12:42 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef RBTREE_H
#define RBTREE_H
/*
* RBNode is intended to be used as the first field of a larger struct,
* whose additional fields carry whatever payload data the caller needs
* for a tree entry. (The total size of that larger struct is passed to
* rb_create.) RBNode is declared here to support this usage, but
* callers must treat it as an opaque struct.
*/
typedef struct RBNode
{
char iteratorState; /* workspace for iterating through tree */
char color; /* node's current color, red or black */
struct RBNode *left; /* left child, or RBNIL if none */
struct RBNode *right; /* right child, or RBNIL if none */
struct RBNode *parent; /* parent, or NULL (not RBNIL!) if none */
} RBNode;
/* Opaque struct representing a whole tree */
typedef struct RBTree RBTree;
typedef struct RBTreeIterator RBTreeIterator;
typedef int (*rb_comparator) (const void *a, const void *b, void *arg);
typedef void *(*rb_appendator) (void *currentdata, void *newval, void *arg);
typedef void (*rb_freefunc) (void *a);
/* Available tree iteration orderings */
typedef enum RBOrderControl
{
LeftRightWalk, /* inorder: left child, node, right child */
RightLeftWalk, /* reverse inorder: right, node, left */
DirectWalk, /* preorder: node, left child, right child */
InvertedWalk /* postorder: left child, right child, node */
} RBOrderControl;
/* Support functions to be provided by caller */
typedef int (*rb_comparator) (const RBNode *a, const RBNode *b, void *arg);
typedef void (*rb_combiner) (RBNode *existing, const RBNode *newdata, void *arg);
typedef RBNode *(*rb_allocfunc) (void *arg);
typedef void (*rb_freefunc) (RBNode *x, void *arg);
extern RBTree *rb_create(rb_comparator comparator,
rb_appendator appendator,
extern RBTree *rb_create(Size node_size,
rb_comparator comparator,
rb_combiner combiner,
rb_allocfunc allocfunc,
rb_freefunc freefunc,
void *arg);
extern void *rb_find(RBTree *rb, void *data);
extern void *rb_insert(RBTree *rb, void *data);
extern void rb_delete(RBTree *rb, void *data);
extern void *rb_leftmost(RBTree *rb);
extern RBNode *rb_find(RBTree *rb, const RBNode *data);
extern RBNode *rb_leftmost(RBTree *rb);
typedef enum RBOrderControl
{
LeftRightWalk,
RightLeftWalk,
DirectWalk,
InvertedWalk
} RBOrderControl;
extern RBNode *rb_insert(RBTree *rb, const RBNode *data, bool *isNew);
extern void rb_delete(RBTree *rb, RBNode *node);
extern RBTreeIterator *rb_begin_iterate(RBTree *rb, RBOrderControl ctrl);
extern void *rb_iterate(RBTreeIterator *iterator);
extern void rb_free_iterator(RBTreeIterator *iterator);
extern void rb_begin_iterate(RBTree *rb, RBOrderControl ctrl);
extern RBNode *rb_iterate(RBTree *rb);
#endif
#endif /* RBTREE_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment