Commit 26c48b5e authored by Tom Lane's avatar Tom Lane

Final stage of psort reconstruction work: replace psort.c with

a generalized module 'tuplesort.c' that can sort either HeapTuples or
IndexTuples, and is not tied to execution of a Sort node.  Clean up
memory leakages in sorting, and replace nbtsort.c's private implementation
of mergesorting with calls to tuplesort.c.
parent 59ed74e6
/*-------------------------------------------------------------------------
*
* btree.c
* nbtree.c
* Implementation of Lehman and Yao's btree management algorithm for
* Postgres.
*
* Copyright (c) 1994, Regents of the University of California
* NOTES
* This file contains only the public interface routines.
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.46 1999/09/18 19:06:10 tgl Exp $
* Copyright (c) 1994, Regents of the University of California
*
* NOTES
* This file contains only the public interface routines.
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.47 1999/10/17 22:15:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -76,7 +76,7 @@ btbuild(Relation heap,
#endif
Node *pred,
*oldPred;
void *spool = (void *) NULL;
BTSpool *spool = NULL;
bool isunique;
bool usefast;
......@@ -147,7 +147,7 @@ btbuild(Relation heap,
if (usefast)
{
spool = _bt_spoolinit(index, 7, isunique);
spool = _bt_spoolinit(index, isunique);
res = (InsertIndexResult) NULL;
}
......@@ -249,11 +249,11 @@ btbuild(Relation heap,
/*
* if we are doing bottom-up btree build, we insert the index into
* a spool page for subsequent processing. otherwise, we insert
* a spool file for subsequent processing. otherwise, we insert
* into the btree.
*/
if (usefast)
_bt_spool(index, btitem, spool);
_bt_spool(btitem, spool);
else
res = _bt_doinsert(index, btitem, isunique, heap);
......@@ -275,15 +275,13 @@ btbuild(Relation heap,
}
/*
* if we are doing bottom-up btree build, we now have a bunch of
* sorted runs in the spool pages. finish the build by (1) merging
* the runs, (2) inserting the sorted tuples into btree pages and (3)
* building the upper levels.
* if we are doing bottom-up btree build, finish the build by
* (1) completing the sort of the spool file, (2) inserting the
* sorted tuples into btree pages and (3) building the upper levels.
*/
if (usefast)
{
_bt_spool(index, (BTItem) NULL, spool); /* flush the spool */
_bt_leafbuild(index, spool);
_bt_leafbuild(spool);
_bt_spooldestroy(spool);
}
......
/*-------------------------------------------------------------------------
* btsort.c
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Id: nbtsort.c,v 1.46 1999/07/19 07:07:19 momjian Exp $
* nbtsort.c
* Build a btree from sorted input by loading leaf pages sequentially.
*
* NOTES
*
* what we do is:
* - generate a set of initial one-block runs, distributed round-robin
* between the output tapes.
* - for each pass,
* - swap input and output tape sets, rewinding both and truncating
* the output tapes.
* - merge the current run in each input tape to the current output
* tape.
* - when each input run has been exhausted, switch to another output
* tape and start processing another run.
* - when we have fewer runs than tapes, we know we are ready to start
* merging into the btree leaf pages. (i.e., we do not have to wait
* until we have exactly one tape.)
* - as we extract tuples from the final runs, we build the pages for
* each level. when we have only one page on a level, it must be the
* root -- it can be attached to the btree metapage and we are done.
*
* conventions:
* - external interface routines take in and return "void *" for their
* opaque handles. this is for modularity reasons.
* We use tuplesort.c to sort the given index tuples into order.
* Then we scan the index tuples in order and build the btree pages
* for each level. When we have only one page on a level, it must be the
* root -- it can be attached to the btree metapage and we are done.
*
* this code is moderately slow (~10% slower) compared to the regular
* btree (insertion) build code on sorted or well-clustered data. on
* random data, however, the insertion build code is unusable -- the
* difference on a 60MB heap is a factor of 15 because the random
* probes into the btree thrash the buffer pool.
* probes into the btree thrash the buffer pool. (NOTE: the above
* "10%" estimate is probably obsolete, since it refers to an old and
* not very good external sort implementation that used to exist in
* this module. tuplesort.c is almost certainly faster.)
*
* this code currently packs the pages to 100% of capacity. this is
* not wise, since *any* insertion will cause splitting. filling to
* something like the standard 70% steady-state load factor for btrees
* would probably be better.
*
* somebody desperately needs to figure out how to do a better job of
* balancing the merge passes -- the fan-in on the final merges can be
* pretty poor, which is bad for performance.
*
* Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.47 1999/10/17 22:15:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include <fcntl.h>
#include "postgres.h"
#include "access/nbtree.h"
#include "utils/tuplesort.h"
#ifdef BTREE_BUILD_STATS
#define ShowExecutorStats pg_options[TRACE_EXECUTORSTATS]
#endif
static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags);
static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
static void *_bt_pagestate(Relation index, int flags, int level, bool doupper);
static void _bt_uppershutdown(Relation index, BTPageState *state);
/*
* turn on debugging output.
*
......@@ -70,689 +49,108 @@ static void _bt_uppershutdown(Relation index, BTPageState *state);
* only really useful for integer keys.
*/
/*#define FASTBUILD_DEBUG*/
#define FASTBUILD_SPOOL
#define FASTBUILD_MERGE
#define MAXTAPES (7)
#define TAPEBLCKSZ (BLCKSZ << 2)
extern int NDirectFileRead;
extern int NDirectFileWrite;
/*
* this is what we use to shovel BTItems in and out of memory. it's
* bigger than a standard block because we are doing a lot of strictly
* sequential i/o. this is obviously something of a tradeoff since we
* are potentially reading a bunch of zeroes off of disk in many
* cases.
*
* BTItems are packed in and MAXALIGN'd.
*
* the fd should not be going out to disk, strictly speaking, but it's
* the only thing like that so i'm not going to worry about wasting a
* few bytes.
*/
typedef struct
{
	int bttb_magic;				/* magic number (BTTAPEMAGIC); sanity-checked
								 * after each block read */
	File bttb_fd;				/* VFD of the backing temp file */
	int bttb_top;				/* offset of start of free space in bttb_data */
	short bttb_ntup;			/* number of tuples in this block */
	short bttb_eor;				/* nonzero marks End-Of-Run */
	/*
	 * Data area fills the block out to TAPEBLCKSZ.  The sizing assumes the
	 * header fields above fit in 2*sizeof(double) bytes -- NOTE(review):
	 * fragile if fields are ever added; confirm before modifying.
	 */
	char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
} BTTapeBlock;
/*
* this structure holds the bookkeeping for a simple balanced multiway
* merge. (polyphase merging is hairier than i want to get into right
* now, and i don't see why i have to care how many "tapes" i use
* right now. though if psort was in a condition that i could hack it
* to do this, you bet i would.)
* Status record for spooling.
*/
typedef struct
struct BTSpool
{
int bts_ntapes;
int bts_tape;
BTTapeBlock **bts_itape; /* input tape blocks */
BTTapeBlock **bts_otape; /* output tape blocks */
Tuplesortstate *sortstate; /* state data for tuplesort.c */
Relation index;
bool isunique;
} BTSpool;
/*-------------------------------------------------------------------------
* sorting comparison routine - returns {-1,0,1} depending on whether
* the key in the left BTItem is {<,=,>} the key in the right BTItem.
*
* we want to use _bt_isortcmp as a comparison function for qsort(3),
* but it needs extra arguments, so we "pass them in" as global
* variables. ick. fortunately, they are the same throughout the
* build, so we need do this only once. this is why you must call
* _bt_isortcmpinit before the call to qsort(3).
*
* a NULL BTItem is always assumed to be greater than any actual
* value; our heap routines (see below) assume that the smallest
* element in the heap is returned. that way, NULL values from the
* exhausted tapes can sift down to the bottom of the heap. in point
* of fact we just don't replace the elements of exhausted tapes, but
* what the heck.
* *-------------------------------------------------------------------------
*/
typedef struct
{
	Datum *btsk_datum;			/* palloc'd array of key attribute values */
	char *btsk_nulls;			/* parallel flags: 'n' = NULL, ' ' = not null */
	BTItem btsk_item;			/* the item itself; NULL acts as a
								 * "+infinity" sentinel for exhausted tapes */
} BTSortKey;
static Relation _bt_sortrel;
static int _bt_nattr;
static BTSpool *_bt_inspool;
/*
 * Stash the sort parameters in file-level globals so that _bt_isortcmp,
 * which must look like a plain qsort(3) comparator, can reach them.
 * Must be called before any sorting or merging on this index.
 */
static void
_bt_isortcmpinit(Relation index, BTSpool *spool)
{
	_bt_inspool = spool;
	_bt_nattr = index->rd_att->natts;
	_bt_sortrel = index;
}
/*
 * qsort-style comparison of two BTSortKeys: returns {-1,0,1} as the left
 * key is {<,=,>} the right.  Extra state (index relation, number of key
 * attributes, owning spool) comes from the file-level globals set up by
 * _bt_isortcmpinit.
 *
 * A NULL BTItem sorts greater than any real item, so exhausted-tape
 * sentinels sink to the bottom of the merge heap.  NULL attribute values
 * likewise sort high.
 */
static int
_bt_isortcmp(BTSortKey *k1, BTSortKey *k2)
{
	Datum *k1_datum = k1->btsk_datum;
	Datum *k2_datum = k2->btsk_datum;
	char *k1_nulls = k1->btsk_nulls;
	char *k2_nulls = k2->btsk_nulls;
	bool equal_isnull = false;	/* did equality arise only through NULLs? */
	int i;

	/* NULL items act as "+infinity" */
	if (k1->btsk_item == (BTItem) NULL)
	{
		if (k2->btsk_item == (BTItem) NULL)
			return 0;			/* 1 = 2 */
		return 1;				/* 1 > 2 */
	}
	else if (k2->btsk_item == (BTItem) NULL)
		return -1;				/* 1 < 2 */

	for (i = 0; i < _bt_nattr; i++)
	{
		if (k1_nulls[i] != ' ')	/* k1 attr is NULL */
		{
			if (k2_nulls[i] != ' ')	/* the same for k2 */
			{
				equal_isnull = true;
				continue;		/* NULL == NULL here; check next attr */
			}
			return 1;			/* NULL ">" NOT_NULL */
		}
		else if (k2_nulls[i] != ' ')	/* k2 attr is NULL */
			return -1;			/* NOT_NULL "<" NULL */

		/* both non-NULL: apply the btree ">" strategy in both directions */
		if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
							k1_datum[i], k2_datum[i]))
			return 1;			/* 1 > 2 */
		else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
								 k2_datum[i], k1_datum[i]))
			return -1;			/* 1 < 2 */
	}

	/*
	 * All attributes compared equal.  For a unique index that is an error,
	 * unless the equality was established only via NULL attributes (NULLs
	 * are never "equal" for uniqueness purposes).  Clean up the spool's
	 * temp files before erroring out.
	 */
	if (_bt_inspool->isunique && !equal_isnull)
	{
		_bt_spooldestroy((void *) _bt_inspool);
		elog(ERROR, "Cannot create unique index. Table contains non-unique values");
	}
	return 0;					/* 1 = 2 */
}
/*
 * Fill in a BTSortKey for the given BTItem: extract each key attribute's
 * datum and a per-attribute null flag ('n' = NULL, ' ' = not null) into
 * freshly palloc'd arrays.  A NULL bti produces an all-NULL sort key,
 * which the merge code treats as a "+infinity" sentinel.
 */
static void
_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk)
{
	sk->btsk_item = (BTItem) NULL;
	sk->btsk_datum = (Datum *) NULL;
	sk->btsk_nulls = (char *) NULL;

	if (bti == (BTItem) NULL)
		return;					/* leave the sentinel key as-is */

	{
		IndexTuple it = &(bti->bti_itup);
		TupleDesc itdesc = index->rd_att;
		Datum *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum));
		char *np = (char *) palloc(_bt_nattr * sizeof(char));
		bool isnull;
		int attno;

		for (attno = 0; attno < _bt_nattr; attno++)
		{
			dp[attno] = index_getattr(it, attno + 1, itdesc, &isnull);
			np[attno] = isnull ? 'n' : ' ';
		}
		sk->btsk_item = bti;
		sk->btsk_datum = dp;
		sk->btsk_nulls = np;
	}
}
/*-------------------------------------------------------------------------
* priority queue methods
*
* these were more-or-less lifted from the heap section of the 1984
* edition of gonnet's book on algorithms and data structures. they
* are coded so that the smallest element in the heap is returned (we
* use them for merging sorted runs).
*
* XXX these probably ought to be generic library functions.
*-------------------------------------------------------------------------
*/
typedef struct
{
	int btpqe_tape;				/* tape identifier */
	BTSortKey btpqe_item;		/* sort key of the tape's current BTItem */
} BTPriQueueElem;

/* one slot per tape is all the merge can ever need */
#define MAXELEM MAXTAPES
typedef struct
{
	int btpq_nelem;				/* current number of entries in the queue */
	BTPriQueueElem btpq_queue[MAXELEM];	/* min-heap, smallest element at [0] */
	Relation btpq_rel;			/* NOTE(review): not referenced by the visible
								 * queue code (comparisons use the globals
								 * from _bt_isortcmpinit) -- verify */
} BTPriQueue;
/* be sure to call _bt_isortcmpinit first */
#define GREATER(a, b) \
(_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
/*
 * Sift-down for the min-heap: restore the heap invariant in the subtree
 * rooted at 'parent' after its element may have become too large.
 * Children of node p live at 2p+1 and 2p+2.
 */
static void
_bt_pqsift(BTPriQueue *q, int parent)
{
	int child;
	BTPriQueueElem e;

	for (child = parent * 2 + 1;
		 child < q->btpq_nelem;
		 child = parent * 2 + 1)
	{
		if (child < q->btpq_nelem - 1)
		{
			/* two children exist: pick the smaller one */
			if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1])))
				++child;
		}
		if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child])))
		{
			/* parent larger than its smallest child: swap and descend */
			e = q->btpq_queue[child]; /* struct = */
			q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
			q->btpq_queue[parent] = e; /* struct = */
			parent = child;
		}
		else
			/*
			 * NOTE(review): a standard sift-down would simply stop here;
			 * setting parent = child + 1 continues sifting at the sibling
			 * node instead.  Harmless for this tiny (<= MAXELEM) heap,
			 * but confirm intent before changing.
			 */
			parent = child + 1;
	}
}
/*
 * Remove the smallest element of the queue into *e.
 * Returns -1 if the queue was already empty, else 0.
 */
static int
_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e)
{
	if (q->btpq_nelem < 1)
		return -1;				/* nothing to deliver */

	*e = q->btpq_queue[0];		/* struct assignment */

	q->btpq_nelem--;
	if (q->btpq_nelem >= 1)
	{
		/* move the last element to the root and restore heap order */
		q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */
		_bt_pqsift(q, 0);
	}
	return 0;
}
/*
 * Insert *e into the priority queue, maintaining the min-heap invariant
 * assumed by _bt_pqsift (children of node p live at 2p+1 and 2p+2).
 * Raises an error on overflow; the queue holds at most MAXELEM entries
 * (one per tape).
 */
static void
_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e)
{
	int child,
		parent;

	if (q->btpq_nelem >= MAXELEM)
		elog(ERROR, "_bt_pqadd: queue overflow");

	/* sift the new element up from the first free slot */
	child = q->btpq_nelem++;
	while (child > 0)
	{
		/*
		 * With children at 2p+1 and 2p+2, the parent of 'child' is
		 * (child-1)/2.  (The previous code used child/2, which names the
		 * wrong parent for even indexes -- e.g. node 2's parent is 0, not
		 * 1 -- and could leave the heap invariant violated.)
		 */
		parent = (child - 1) / 2;
		if (GREATER(e, &(q->btpq_queue[parent])))
			break;				/* heap property satisfied */
		/* shift the parent down and keep climbing */
		q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
		child = parent;
	}
	q->btpq_queue[child] = *e;	/* struct = */
}
/*-------------------------------------------------------------------------
* tape methods
*-------------------------------------------------------------------------
*/
};
#define BTITEMSZ(btitem) \
((btitem) ? \
(IndexTupleDSize((btitem)->bti_itup) + \
(sizeof(BTItemData) - sizeof(IndexTupleData))) : \
0)
#define SPCLEFT(tape) \
(sizeof((tape)->bttb_data) - (tape)->bttb_top)
#define EMPTYTAPE(tape) \
((tape)->bttb_ntup <= 0)
#define BTTAPEMAGIC 0x19660226
/*
* reset the tape header for its next use without doing anything to
* the physical tape file. (setting bttb_top to 0 makes the block
* empty.)
*/
/*
 * Make the in-memory tape block empty again, without touching the
 * underlying tape file.  (bttb_top == 0 means "no data in buffer".)
 */
static void
_bt_tapereset(BTTapeBlock *tape)
{
	tape->bttb_ntup = 0;
	tape->bttb_top = 0;
	tape->bttb_eor = 0;
}
/*
 * rewind the physical tape file to its beginning.
 */
static void
_bt_taperewind(BTTapeBlock *tape)
{
	/* seek the VFD to byte 0; return value deliberately ignored */
	FileSeek(tape->bttb_fd, 0L, SEEK_SET);
}
/*
* destroy the contents of the physical tape file without destroying
* the tape data structure or removing the physical tape file.
*
* we use the VFD version of ftruncate(2) to do this rather than
* unlinking and recreating the file. you still have to wait while
* the OS frees up all of the file system blocks and stuff, but at
* least you don't have to delete and reinsert the directory entries.
*/
static void
_bt_tapeclear(BTTapeBlock *tape)
{
	/* blow away the contents of the old file */
	_bt_taperewind(tape);
#ifdef NOT_USED
	FileSync(tape->bttb_fd);
#endif
	/* truncate via the VFD layer instead of unlink+recreate (see header
	 * comment above: avoids directory-entry churn) */
	FileTruncate(tape->bttb_fd, 0);

	/* reset the in-memory buffer too */
	_bt_tapereset(tape);
}
/*
* create a new BTTapeBlock, allocating memory for the data structure
* as well as opening a physical tape file.
*/
/*
 * Create a new BTTapeBlock: allocate the in-memory structure and open a
 * backing temporary file for it.  The buffer starts out empty.
 */
static BTTapeBlock *
_bt_tapecreate(void)
{
	BTTapeBlock *tape;

	tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));
	if (tape == (BTTapeBlock *) NULL)
		elog(ERROR, "_bt_tapecreate: out of memory");

	tape->bttb_magic = BTTAPEMAGIC;

	tape->bttb_fd = OpenTemporaryFile();
	Assert(tape->bttb_fd >= 0);

	/* start with an empty buffer */
	_bt_tapereset(tape);

	return tape;
}
/*
 * destroy the BTTapeBlock structure and its physical tape file.
 */
static void
_bt_tapedestroy(BTTapeBlock *tape)
{
	/* unlink first: the VFD lives inside the struct we are about to free */
	FileUnlink(tape->bttb_fd);
	pfree((void *) tape);
}
/*
* flush the tape block to the file, marking End-Of-Run if requested.
*/
/*
 * Flush the tape block to its file, marking End-Of-Run if requested,
 * then reset the in-memory buffer for reuse.
 *
 * The previous code ignored FileWrite's result; a short write (e.g.
 * out of disk space) would silently corrupt the sort, so report it.
 */
static void
_bt_tapewrite(BTTapeBlock *tape, int eor)
{
	tape->bttb_eor = eor;
	if (FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ) != TAPEBLCKSZ)
		elog(ERROR, "_bt_tapewrite: failed to write tape block");
	NDirectFileWrite += TAPEBLCKSZ / BLCKSZ;
	_bt_tapereset(tape);
}
/*
* read a tape block from the file, overwriting the current contents
* of the buffer.
*
* returns:
* - 0 if there are no more blocks in the tape or in this run (call
* _bt_tapereset to clear the End-Of-Run marker)
* - 1 if a valid block was read
*/
/*
 * Read the next block of the tape file into the buffer, overwriting its
 * current contents.
 *
 * returns:
 *	- 0 if there are no more blocks in the tape or in this run (call
 *	  _bt_tapereset to clear the End-Of-Run marker)
 *	- 1 if a valid block was read
 */
static int
_bt_taperead(BTTapeBlock *tape)
{
	File fd;
	int nread;

	if (tape->bttb_eor)
	{
		return 0;				/* we are already at End-Of-Run */
	}

	/*
	 * we're clobbering the whole in-memory block, so we must save the VFD
	 * across the read -- the fd field stored in the on-disk image is bogus.
	 */
	fd = tape->bttb_fd;
	nread = FileRead(fd, (char *) tape, TAPEBLCKSZ);
	tape->bttb_fd = fd;

	if (nread != TAPEBLCKSZ)
	{
		/* blocks are written whole, so a short read can only be EOF */
		Assert(nread == 0);
		return 0;
	}
	Assert(tape->bttb_magic == BTTAPEMAGIC);
	NDirectFileRead += TAPEBLCKSZ / BLCKSZ;
	return 1;
}
/*
* get the next BTItem from a tape block.
*
* returns:
* - NULL if we have run out of BTItems
* - a pointer to the BTItemData in the block otherwise
*
* side effects:
* - sets 'pos' to the current position within the block.
*/
static BTItem
_bt_tapenext(BTTapeBlock *tape, char **pos)
{
Size itemsz;
BTItem bti;
static void _bt_load(Relation index, BTSpool *btspool);
static BTItem _bt_buildadd(Relation index, BTPageState *state, BTItem bti,
int flags);
static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
static BTPageState *_bt_pagestate(Relation index, int flags,
int level, bool doupper);
static void _bt_uppershutdown(Relation index, BTPageState *state);
if (*pos >= tape->bttb_data + tape->bttb_top)
return (BTItem) NULL;
bti = (BTItem) *pos;
itemsz = BTITEMSZ(bti);
*pos += MAXALIGN(itemsz);
return bti;
}
/*
* copy a BTItem into a tape block.
*
* assumes that we have already checked to see if the block has enough
* space for the item.
*
* side effects:
*
* - advances the 'top' pointer in the tape block header to point to
* the beginning of free space.
* Interface routines
*/
/*
 * Append a BTItem to the tape block's buffer and bump the tuple count.
 * The caller must already have verified (via SPCLEFT) that the
 * MAXALIGN'd item fits; bttb_top is advanced past the aligned copy.
 */
static void
_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz)
{
	memcpy(tape->bttb_data + tape->bttb_top, item, itemsz);
	++tape->bttb_ntup;
	tape->bttb_top += MAXALIGN(itemsz);
}
/*-------------------------------------------------------------------------
* spool methods
*-------------------------------------------------------------------------
*/
/*
* create and initialize a spool structure, including the underlying
* files.
* create and initialize a spool structure
*/
void *
_bt_spoolinit(Relation index, int ntapes, bool isunique)
BTSpool *
_bt_spoolinit(Relation index, bool isunique)
{
BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool));
int i;
if (btspool == (BTSpool *) NULL)
elog(ERROR, "_bt_spoolinit: out of memory");
MemSet((char *) btspool, 0, sizeof(BTSpool));
btspool->bts_ntapes = ntapes;
btspool->bts_tape = 0;
btspool->isunique = isunique;
btspool->bts_itape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
btspool->bts_otape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
if (btspool->bts_itape == (BTTapeBlock **) NULL ||
btspool->bts_otape == (BTTapeBlock **) NULL)
elog(ERROR, "_bt_spoolinit: out of memory");
btspool->index = index;
btspool->isunique = isunique;
for (i = 0; i < ntapes; ++i)
{
btspool->bts_itape[i] = _bt_tapecreate();
btspool->bts_otape[i] = _bt_tapecreate();
}
btspool->sortstate = tuplesort_begin_index(index, isunique, false);
_bt_isortcmpinit(index, btspool);
/*
* Currently, tuplesort provides sort functions on IndexTuples.
* If we kept anything in a BTItem other than a regular IndexTuple,
* we'd need to modify tuplesort to understand BTItems as such.
*/
Assert(sizeof(BTItemData) == sizeof(IndexTupleData));
return (void *) btspool;
return btspool;
}
/*
* clean up a spool structure and its substructures.
*/
void
_bt_spooldestroy(void *spool)
_bt_spooldestroy(BTSpool *btspool)
{
BTSpool *btspool = (BTSpool *) spool;
int i;
for (i = 0; i < btspool->bts_ntapes; ++i)
{
_bt_tapedestroy(btspool->bts_otape[i]);
_bt_tapedestroy(btspool->bts_itape[i]);
}
tuplesort_end(btspool->sortstate);
pfree((void *) btspool);
}
/*
* flush out any dirty output tape blocks
* spool a btitem into the sort file.
*/
static void
_bt_spoolflush(BTSpool *btspool)
void
_bt_spool(BTItem btitem, BTSpool *btspool)
{
int i;
for (i = 0; i < btspool->bts_ntapes; ++i)
{
if (!EMPTYTAPE(btspool->bts_otape[i]))
_bt_tapewrite(btspool->bts_otape[i], 1);
}
/* A BTItem is really just an IndexTuple */
tuplesort_puttuple(btspool->sortstate, (void *) btitem);
}
/*
* swap input tapes and output tapes by swapping their file
* descriptors. additional preparation for the next merge pass
* includes rewinding the new input tapes and clearing out the new
* output tapes.
* given a spool loaded by successive calls to _bt_spool,
* create an entire btree.
*/
static void
_bt_spoolswap(BTSpool *btspool)
void
_bt_leafbuild(BTSpool *btspool)
{
File tmpfd;
BTTapeBlock *itape;
BTTapeBlock *otape;
int i;
for (i = 0; i < btspool->bts_ntapes; ++i)
#ifdef BTREE_BUILD_STATS
if (ShowExecutorStats)
{
itape = btspool->bts_itape[i];
otape = btspool->bts_otape[i];
/*
* swap the input and output VFDs.
*/
tmpfd = itape->bttb_fd;
itape->bttb_fd = otape->bttb_fd;
otape->bttb_fd = tmpfd;
/*
* rewind the new input tape.
*/
_bt_taperewind(itape);
_bt_tapereset(itape);
/*
* clear the new output tape -- it's ok to throw away the old
* inputs.
*/
_bt_tapeclear(otape);
fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
ShowUsage();
ResetUsage();
}
#endif
tuplesort_performsort(btspool->sortstate);
_bt_load(btspool->index, btspool);
}
/*-------------------------------------------------------------------------
* sorting routines
*-------------------------------------------------------------------------
*/
/*
* spool 'btitem' into an initial run. as tape blocks are filled, the
* block BTItems are qsorted and written into some output tape (it
* doesn't matter which; we go round-robin for simplicity). the
* initial runs are therefore always just one block.
* Internal routines.
*/
void
_bt_spool(Relation index, BTItem btitem, void *spool)
{
BTSpool *btspool = (BTSpool *) spool;
BTTapeBlock *itape;
Size itemsz;
_bt_isortcmpinit(index, btspool);
itape = btspool->bts_itape[btspool->bts_tape];
itemsz = BTITEMSZ(btitem);
itemsz = MAXALIGN(itemsz);
/*
* if this buffer is too full for this BTItemData, or if we have run
* out of BTItems, we need to sort the buffer and write it out. in
* this case, the BTItemData will go into the next tape's buffer.
*/
if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz)
{
BTSortKey *parray = (BTSortKey *) NULL;
BTTapeBlock *otape;
BTItem bti;
char *pos;
int btisz;
int it_ntup = itape->bttb_ntup;
int i;
/*
* build an array of pointers to the BTItemDatas on the input
* block.
*/
if (it_ntup > 0)
{
parray = (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey));
pos = itape->bttb_data;
for (i = 0; i < it_ntup; ++i)
_bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i]));
/*
* qsort the pointer array.
*/
qsort((void *) parray, it_ntup, sizeof(BTSortKey),
(int (*) (const void *, const void *)) _bt_isortcmp);
}
/*
* write the spooled run into the output tape. we copy the
* BTItemDatas in the order dictated by the sorted array of
* BTItems, not the original order.
*
* (since everything was MAXALIGN'd and is all on a single tape
* block, everything had *better* still fit on one tape block..)
*/
otape = btspool->bts_otape[btspool->bts_tape];
for (i = 0; i < it_ntup; ++i)
{
bti = parray[i].btsk_item;
btisz = BTITEMSZ(bti);
btisz = MAXALIGN(btisz);
_bt_tapeadd(otape, bti, btisz);
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL)
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att,
&isnull);
printf("_bt_spool: inserted <%x> into output tape %d\n",
d, btspool->bts_tape);
}
#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */
}
/*
* the initial runs are always single tape blocks. flush the
* output block, marking End-Of-Run.
*/
_bt_tapewrite(otape, 1);
/*
* reset the input buffer for the next run. we don't have to
* write it out or anything -- we only use it to hold the unsorted
* BTItemDatas, the output tape contains all the sorted stuff.
*
* changing bts_tape changes the output tape and input tape; we
* change itape for the code below.
*/
_bt_tapereset(itape);
btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
itape = btspool->bts_itape[btspool->bts_tape];
/*
* destroy the pointer array.
*/
if (parray != (BTSortKey *) NULL)
{
for (i = 0; i < it_ntup; i++)
{
if (parray[i].btsk_datum != (Datum *) NULL)
pfree((void *) (parray[i].btsk_datum));
if (parray[i].btsk_nulls != (char *) NULL)
pfree((void *) (parray[i].btsk_nulls));
}
pfree((void *) parray);
}
}
/* insert this item into the current buffer */
if (btitem != (BTItem) NULL)
_bt_tapeadd(itape, btitem, itemsz);
}
/*
* allocate a new, clean btree page, not linked to any siblings.
......@@ -805,7 +203,7 @@ _bt_slideleft(Relation index, Buffer buf, Page page)
* allocate and initialize a new BTPageState. the returned structure
* is suitable for immediate use by _bt_buildadd.
*/
static void *
static BTPageState *
_bt_pagestate(Relation index, int flags, int level, bool doupper)
{
BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));
......@@ -819,7 +217,7 @@ _bt_pagestate(Relation index, int flags, int level, bool doupper)
state->btps_level = level;
state->btps_doupper = doupper;
return (void *) state;
return state;
}
/*
......@@ -883,9 +281,8 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend)
* if all keys are unique, 'first' will always be the same as 'last'.
*/
static BTItem
_bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
{
BTPageState *state = (BTPageState *) pstate;
Buffer nbuf;
Page npage;
BTItem last_bti;
......@@ -944,8 +341,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
if (PageAddItem(npage, PageGetItem(opage, ii),
ii->lp_len, n, LP_USED) == InvalidOffsetNumber)
elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
#ifdef NOT_USED
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
#ifdef FASTBUILD_DEBUG
{
bool isnull;
BTItem tmpbti =
......@@ -956,7 +352,6 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
d, n, state->btps_level);
}
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
}
......@@ -989,7 +384,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
nopaque->btpo_prev = BufferGetBlockNumber(obuf);
nopaque->btpo_next = P_NONE;
if (_bt_itemcmp(index, _bt_nattr,
if (_bt_itemcmp(index, index->rd_att->natts,
(BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)),
(BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)),
BTEqualStrategyNumber))
......@@ -1030,8 +425,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
off = OffsetNumberNext(last_off);
if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber)
elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)");
#ifdef NOT_USED
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
#ifdef FASTBUILD_DEBUG
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
......@@ -1039,11 +433,10 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
d, off, state->btps_level);
}
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
if (last_bti == (BTItem) NULL)
first_off = P_FIRSTKEY;
else if (!_bt_itemcmp(index, _bt_nattr,
else if (!_bt_itemcmp(index, index->rd_att->natts,
bti, last_bti, BTEqualStrategyNumber))
first_off = off;
last_off = off;
......@@ -1103,224 +496,31 @@ _bt_uppershutdown(Relation index, BTPageState *state)
}
/*
* take the input tapes stored by 'btspool' and perform successive
* merging passes until at most one run is left in each tape. at that
* point, merge the final tape runs into a set of btree leaves.
*
* XXX three nested loops? gross. cut me up into smaller routines.
* Read tuples in correct sort order from tuplesort, and load them into
* btree leaves.
*/
static void
_bt_merge(Relation index, BTSpool *btspool)
_bt_load(Relation index, BTSpool *btspool)
{
BTPageState *state;
BTPriQueue q;
BTPriQueueElem e;
BTSortKey btsk;
BTItem bti;
BTTapeBlock *itape;
BTTapeBlock *otape;
char *tapepos[MAXTAPES];
int tapedone[MAXTAPES];
int t;
int goodtapes;
int npass;
int nruns;
Size btisz;
bool doleaf = false;
bool should_free;
/*
* initialize state needed for the merge into the btree leaf pages.
*/
state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true);
npass = 0;
do
{ /* pass */
/*
* each pass starts by flushing the previous outputs and swapping
* inputs and outputs. flushing sets End-of-Run for any dirty
* output tapes. swapping clears the new output tapes and rewinds
* the new input tapes.
*/
btspool->bts_tape = btspool->bts_ntapes - 1;
_bt_spoolflush(btspool);
_bt_spoolswap(btspool);
++npass;
nruns = 0;
for (;;)
{ /* run */
/*
* each run starts by selecting a new output tape. the merged
* results of a given run are always sent to this one tape.
*/
btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
otape = btspool->bts_otape[btspool->bts_tape];
/*
* initialize the priority queue by loading it with the first
* element of the given run in each tape. since we are
* starting a new run, we reset the tape (clearing the
* End-Of-Run marker) before reading it. this means that
* _bt_taperead will return 0 only if the tape is actually at
* EOF.
*/
MemSet((char *) &q, 0, sizeof(BTPriQueue));
goodtapes = 0;
for (t = 0; t < btspool->bts_ntapes; ++t)
{
itape = btspool->bts_itape[t];
tapepos[t] = itape->bttb_data;
tapedone[t] = 0;
_bt_tapereset(itape);
do
{
if (_bt_taperead(itape) == 0)
tapedone[t] = 1;
} while (!tapedone[t] && EMPTYTAPE(itape));
if (!tapedone[t])
{
++goodtapes;
e.btpqe_tape = t;
_bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]),
&(e.btpqe_item));
if (e.btpqe_item.btsk_item != (BTItem) NULL)
_bt_pqadd(&q, &e);
}
}
/*
* if we don't have any tapes with any input (i.e., they are
* all at EOF), there is no work to do in this run -- we must
* be done with this pass.
*/
if (goodtapes == 0)
{
break; /* for */
}
++nruns;
/*
* output the smallest element from the queue until there are
* no more.
*/
while (_bt_pqnext(&q, &e) >= 0)
{ /* item */
/*
* replace the element taken from priority queue, fetching
* a new block if needed. a tape can run out if it hits
* either End-Of-Run or EOF.
*/
t = e.btpqe_tape;
btsk = e.btpqe_item;
bti = btsk.btsk_item;
if (bti != (BTItem) NULL)
{
btisz = BTITEMSZ(bti);
btisz = MAXALIGN(btisz);
if (doleaf)
{
_bt_buildadd(index, state, bti, BTP_LEAF);
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1,
index->rd_att, &isnull);
printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n",
npass, nruns, d, t,
BufferGetBlockNumber(state->btps_buf));
}
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
}
else
{
if (SPCLEFT(otape) < btisz)
{
/*
* if it's full, write it out and add the item
* to the next block. (since we will be
* adding another tuple immediately after
* this, we can be sure that there will be at
* least one more block in this run and so we
* know we do *not* want to set End-Of-Run
* here.)
*/
_bt_tapewrite(otape, 0);
}
_bt_tapeadd(otape, bti, btisz);
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1,
index->rd_att, &isnull);
printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n",
npass, nruns, d, t,
btspool->bts_tape);
}
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
}
if (btsk.btsk_datum != (Datum *) NULL)
pfree((void *) (btsk.btsk_datum));
if (btsk.btsk_nulls != (char *) NULL)
pfree((void *) (btsk.btsk_nulls));
}
itape = btspool->bts_itape[t];
if (!tapedone[t])
{
BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
if (newbti == (BTItem) NULL)
{
do
{
if (_bt_taperead(itape) == 0)
tapedone[t] = 1;
} while (!tapedone[t] && EMPTYTAPE(itape));
if (!tapedone[t])
{
tapepos[t] = itape->bttb_data;
newbti = _bt_tapenext(itape, &tapepos[t]);
}
}
if (newbti != (BTItem) NULL)
{
BTPriQueueElem nexte;
nexte.btpqe_tape = t;
_bt_setsortkey(index, newbti, &(nexte.btpqe_item));
_bt_pqadd(&q, &nexte);
}
}
} /* item */
/*
* that's it for this run. flush the output tape, marking
* End-of-Run.
*/
_bt_tapewrite(otape, 1);
} /* run */
state = _bt_pagestate(index, BTP_LEAF, 0, true);
/*
* we are here because we ran out of input on all of the input
* tapes.
*
* if this pass did not generate more actual output runs than we have
* tapes, we know we have at most one run in each tape. this
* means that we are ready to merge into the final btree leaf
* pages instead of merging into a tape file.
*/
if (nruns <= btspool->bts_ntapes)
doleaf = true;
} while (nruns > 0); /* pass */
for (;;)
{
bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true,
&should_free);
if (bti == (BTItem) NULL)
break;
_bt_buildadd(index, state, bti, BTP_LEAF);
if (should_free)
pfree((void *) bti);
}
_bt_uppershutdown(index, state);
}
......@@ -1359,7 +559,7 @@ _bt_upperbuild(Relation index)
ropaque->btpo_flags &= ~BTP_ROOT;
_bt_wrtbuf(index, rbuf);
state = (BTPageState *) _bt_pagestate(index, 0, 0, true);
state = _bt_pagestate(index, 0, 0, true);
/* for each page... */
do
......@@ -1380,7 +580,7 @@ _bt_upperbuild(Relation index)
* the lower page and insert it into a page at this level.
*/
nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque));
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
#ifdef FASTBUILD_DEBUG
{
bool isnull;
Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att,
......@@ -1389,7 +589,7 @@ _bt_upperbuild(Relation index)
printf("_bt_upperbuild: inserting <%x> at %d\n",
d, state->btps_level);
}
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
_bt_buildadd(index, state, nbti, 0);
pfree((void *) nbti);
}
......@@ -1401,25 +601,3 @@ _bt_upperbuild(Relation index)
}
#endif
/*
 * _bt_leafbuild -- given a spool loaded by successive calls to _bt_spool,
 *		create an entire btree from its contents.
 *
 * The opaque 'spool' pointer is really a BTSpool (see _bt_spoolinit);
 * callers pass it as void * so they need not know the spool's innards.
 */
void
_bt_leafbuild(Relation index, void *spool)
{
	/* set up the static comparison state used while merging spool runs */
	_bt_isortcmpinit(index, (BTSpool *) spool);

#ifdef BTREE_BUILD_STATS
	/* optionally report and reset resource usage accumulated so far */
	if (ShowExecutorStats)
	{
		fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
		ShowUsage();
		ResetUsage();
	}
#endif

	/* merge the sorted runs and emit the finished btree leaf pages */
	_bt_merge(index, (BTSpool *) spool);
}
......@@ -7,16 +7,17 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.23 1999/07/17 20:16:58 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.24 1999/10/17 22:15:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "executor/executor.h"
#include "executor/execdebug.h"
#include "executor/nodeSort.h"
#include "utils/psort.h"
#include "utils/tuplesort.h"
/* ----------------------------------------------------------------
* FormSortKeys(node)
......@@ -83,11 +84,9 @@ FormSortKeys(Sort *sortnode)
/* ----------------------------------------------------------------
* ExecSort
*
* old comments
* Sorts tuples from the outer subtree of the node in psort,
* Sorts tuples from the outer subtree of the node using tuplesort,
* which saves the results in a temporary file or memory. After the
* initial call, returns a tuple from the file with each call.
* Assumes that heap access method is used.
*
* Conditions:
* -- none.
......@@ -101,10 +100,8 @@ ExecSort(Sort *node)
{
EState *estate;
SortState *sortstate;
Plan *outerNode;
ScanDirection dir;
int keycount;
ScanKey sortkeys;
Tuplesortstate *tuplesortstate;
HeapTuple heapTuple;
TupleTableSlot *slot;
bool should_free;
......@@ -119,43 +116,71 @@ ExecSort(Sort *node)
sortstate = node->sortstate;
estate = node->plan.state;
dir = estate->es_direction;
tuplesortstate = (Tuplesortstate *) sortstate->tuplesortstate;
/* ----------------
* the first time we call this, psort sorts this into a file.
* Subsequent calls return tuples from psort.
* If first time through, read all tuples from outer plan and
* pass them to tuplesort.c.
* Subsequent calls just fetch tuples from tuplesort.
* ----------------
*/
if (sortstate->sort_Flag == false)
if (! sortstate->sort_Done)
{
Plan *outerNode;
TupleDesc tupDesc;
int keycount;
ScanKey sortkeys;
SO1_printf("ExecSort: %s\n",
"sortstate == false -> sorting subplan");
"sorting subplan");
/* ----------------
* set all relations to be scanned in the forward direction
* while creating the temporary relation.
* Want to scan subplan in the forward direction while creating
* the sorted data. (Does setting my direction actually affect
* the subplan? I bet this is useless code...)
* ----------------
*/
estate->es_direction = ForwardScanDirection;
/* ----------------
* prepare information for psort_begin()
* Initialize tuplesort module.
* ----------------
*/
outerNode = outerPlan((Plan *) node);
SO1_printf("ExecSort: %s\n",
"calling tuplesort_begin");
outerNode = outerPlan((Plan *) node);
tupDesc = ExecGetTupType(outerNode);
keycount = node->keycount;
sortkeys = (ScanKey) sortstate->sort_Keys;
SO1_printf("ExecSort: %s\n",
"calling psort_begin");
if (!psort_begin(node, /* this node */
keycount, /* number keys */
sortkeys)) /* keys */
tuplesortstate = tuplesort_begin_heap(tupDesc, keycount, sortkeys,
true /* randomAccess */);
sortstate->tuplesortstate = (void *) tuplesortstate;
/* ----------------
* Scan the subplan and feed all the tuples to tuplesort.
* ----------------
*/
for (;;)
{
/* Psort says, there are no tuples to be sorted */
return NULL;
slot = ExecProcNode(outerNode, (Plan *) node);
if (TupIsNull(slot))
break;
tuplesort_puttuple(tuplesortstate, (void *) slot->val);
ExecClearTuple(slot);
}
/* ----------------
* Complete the sort.
* ----------------
*/
tuplesort_performsort(tuplesortstate);
/* ----------------
* restore to user specified direction
* ----------------
......@@ -167,25 +192,29 @@ ExecSort(Sort *node)
* ----------------
*/
slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot;
slot->ttc_tupleDescriptor = ExecGetTupType(outerNode);
slot->ttc_tupleDescriptor = tupDesc;
/* ----------------
* finally set the sorted flag to true
* ----------------
*/
sortstate->sort_Flag = true;
sortstate->sort_Done = true;
SO1_printf(stderr, "ExecSort: sorting done.\n");
}
else
slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot;
SO1_printf("ExecSort: %s\n",
"retrieving tuple from sorted relation");
"retrieving tuple from tuplesort");
/* ----------------
* at this point we grab a tuple from psort
* Get the first or next tuple from tuplesort.
* Returns NULL if no more tuples.
* ----------------
*/
heapTuple = psort_grabtuple(node, &should_free);
heapTuple = tuplesort_getheaptuple(tuplesortstate,
ScanDirectionIsForward(dir),
&should_free);
return ExecStoreTuple(heapTuple, slot, InvalidBuffer, should_free);
}
......@@ -193,7 +222,6 @@ ExecSort(Sort *node)
/* ----------------------------------------------------------------
* ExecInitSort
*
* old comments
* Creates the run-time state information for the sort node
* produced by the planner and initailizes its outer subtree.
* ----------------------------------------------------------------
......@@ -203,7 +231,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
{
SortState *sortstate;
Plan *outerPlan;
ScanKey sortkeys;
SO1_printf("ExecInitSort: %s\n",
"initializing sort node");
......@@ -219,14 +246,14 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
* ----------------
*/
sortstate = makeNode(SortState);
sortstate->sort_Flag = 0;
sortstate->sort_Done = false;
sortstate->sort_Keys = NULL;
node->cleaned = FALSE;
sortstate->tuplesortstate = NULL;
node->sortstate = sortstate;
/* ----------------
* Miscellanious initialization
* Miscellaneous initialization
*
* + assign node's base_id
* + assign debugging hooks
......@@ -259,9 +286,7 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
* initialize sortstate information
* ----------------
*/
sortkeys = FormSortKeys(node);
sortstate->sort_Keys = sortkeys;
sortstate->sort_Flag = false;
sortstate->sort_Keys = FormSortKeys(node);
/* ----------------
* initialize tuple type. no need to initialize projection
......@@ -275,11 +300,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
SO1_printf("ExecInitSort: %s\n",
"sort node initialized");
/* ----------------
* return relation oid of temporary sort relation in a list
* (someday -- for now we return LispTrue... cim 10/12/89)
* ----------------
*/
return TRUE;
}
......@@ -293,8 +313,6 @@ ExecCountSlotsSort(Sort *node)
/* ----------------------------------------------------------------
* ExecEndSort(node)
*
* old comments
* ----------------------------------------------------------------
*/
void
......@@ -325,8 +343,13 @@ ExecEndSort(Sort *node)
*/
ExecClearTuple(sortstate->csstate.css_ScanTupleSlot);
/* Clean up after psort */
psort_end(node);
/* ----------------
* Release tuplesort resources
* ----------------
*/
if (sortstate->tuplesortstate != NULL)
tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate);
sortstate->tuplesortstate = NULL;
SO1_printf("ExecEndSort: %s\n",
"sort node shutdown");
......@@ -335,51 +358,47 @@ ExecEndSort(Sort *node)
/* ----------------------------------------------------------------
* ExecSortMarkPos
*
* Calls psort to save the current position in the sorted file.
* Calls tuplesort to save the current position in the sorted file.
* ----------------------------------------------------------------
*/
void
ExecSortMarkPos(Sort *node)
{
SortState *sortstate;
SortState *sortstate = node->sortstate;
/* ----------------
* if we haven't sorted yet, just return
* ----------------
*/
sortstate = node->sortstate;
if (sortstate->sort_Flag == false)
if (! sortstate->sort_Done)
return;
psort_markpos(node);
return;
tuplesort_markpos((Tuplesortstate *) sortstate->tuplesortstate);
}
/* ----------------------------------------------------------------
* ExecSortRestrPos
*
* Calls psort to restore the last saved sort file position.
* Calls tuplesort to restore the last saved sort file position.
* ----------------------------------------------------------------
*/
void
ExecSortRestrPos(Sort *node)
{
SortState *sortstate;
SortState *sortstate = node->sortstate;
/* ----------------
* if we haven't sorted yet, just return.
* ----------------
*/
sortstate = node->sortstate;
if (sortstate->sort_Flag == false)
if (! sortstate->sort_Done)
return;
/* ----------------
* restore the scan to the previously marked position
* ----------------
*/
psort_restorepos(node);
tuplesort_restorepos((Tuplesortstate *) sortstate->tuplesortstate);
}
void
......@@ -392,17 +411,25 @@ ExecReScanSort(Sort *node, ExprContext *exprCtxt, Plan *parent)
* not NULL then it will be re-scanned by ExecProcNode, else - no
* reason to re-scan it at all.
*/
if (sortstate->sort_Flag == false)
if (! sortstate->sort_Done)
return;
ExecClearTuple(sortstate->csstate.cstate.cs_ResultTupleSlot);
psort_rescan(node);
/*
* If subnode is to be rescanned then we aren't sorted
* If subnode is to be rescanned then we forget previous sort
* results; we have to re-read the subplan and re-sort.
*
* Otherwise we can just rewind and rescan the sorted output.
*/
if (((Plan *) node)->lefttree->chgParam != NULL)
sortstate->sort_Flag = false;
{
sortstate->sort_Done = false;
tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate);
sortstate->tuplesortstate = NULL;
}
else
{
tuplesort_rescan((Tuplesortstate *) sortstate->tuplesortstate);
}
}
......@@ -4,7 +4,7 @@
# Makefile for utils/sort
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.6 1999/10/16 19:49:27 tgl Exp $
# $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.7 1999/10/17 22:15:05 tgl Exp $
#
#-------------------------------------------------------------------------
......@@ -13,7 +13,7 @@ include ../../../Makefile.global
CFLAGS += -I../..
OBJS = logtape.o lselect.o psort.o
OBJS = logtape.o tuplesort.o
all: SUBSYS.o
......
......@@ -4,8 +4,8 @@
* Management of "logical tapes" within temporary files.
*
* This module exists to support sorting via multiple merge passes (see
* psort.c). Merging is an ideal algorithm for tape devices, but if we
* implement it on disk by creating a separate file for each "tape",
* tuplesort.c). Merging is an ideal algorithm for tape devices, but if
* we implement it on disk by creating a separate file for each "tape",
* there is an annoying problem: the peak space usage is at least twice
* the volume of actual data to be sorted. (This must be so because each
* datum will appear in both the input and output tapes of the final
......@@ -23,7 +23,7 @@
* Few OSes allow arbitrary parts of a file to be released back to the OS,
* so we have to implement this space-recycling ourselves within a single
* logical file. logtape.c exists to perform this bookkeeping and provide
* the illusion of N independent tape devices to psort.c. Note that
* the illusion of N independent tape devices to tuplesort.c. Note that
* logtape.c itself depends on buffile.c to provide a "logical file" of
* larger size than the underlying OS may support.
*
......@@ -63,7 +63,7 @@
* Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.1 1999/10/16 19:49:27 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.2 1999/10/17 22:15:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......
/*-------------------------------------------------------------------------
*
* tuplesort.c
* Generalized tuple sorting routines.
*
* This module handles sorting of either heap tuples or index tuples
* (and could fairly easily support other kinds of sortable objects,
* if necessary). It works efficiently for both small and large amounts
* of data. Small amounts are sorted in-memory using qsort(). Large
* amounts are sorted using temporary files and a standard external sort
* algorithm.
*
* See Knuth, volume 3, for more than you want to know about the external
* sorting algorithm. We divide the input into sorted runs using replacement
* selection, in the form of a priority tree implemented as a heap
* (essentially his Algorithm 5.2.3H), then merge the runs using polyphase
* merge, Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D
* are implemented by logtape.c, which avoids space wastage by recycling
* disk space as soon as each block is read from its "tape".
*
* We do not form the initial runs using Knuth's recommended replacement
* selection method (Algorithm 5.4.1R), because it uses a fixed number of
* records in memory at all times. Since we are dealing with tuples that
* may vary considerably in size, we want to be able to vary the number of
* records kept in memory to ensure full utilization of the allowed sort
* memory space. This is easily done by keeping a variable-size heap in
* which the records of the current run are stored, plus a variable-size
* unsorted array holding records that must go into the next run.
*
* The (approximate) amount of memory allowed for any one sort operation
* is given in kilobytes by the external variable SortMem. Initially,
* we absorb tuples and simply store them in an unsorted array as long as
* we haven't exceeded SortMem. If we reach the end of the input without
* exceeding SortMem, we sort the array using qsort() and subsequently return
* tuples just by scanning the tuple array sequentially. If we do exceed
* SortMem, we construct a heap using Algorithm H and begin to emit tuples
* into sorted runs in temporary tapes, emitting just enough tuples at each
* step to get back within the SortMem limit. New tuples are added to the
* heap if they can go into the current run, else they are temporarily added
* to the unsorted array. Whenever the heap empties, we construct a new heap
* from the current contents of the unsorted array, and begin a new run with a
* new output tape (selected per Algorithm D). After the end of the input
* is reached, we dump out remaining tuples in memory into a final run
* (or two), then merge the runs using Algorithm D.
*
* When the caller requests random access to the sort result, we form
* the final sorted run on a logical tape which is then "frozen", so
* that we can access it randomly. When the caller does not need random
* access, we return from tuplesort_performsort() as soon as we are down
* to one run per logical tape. The final merge is then performed
* on-the-fly as the caller repeatedly calls tuplesort_gettuple; this
* saves one cycle of writing all the data out to disk and reading it in.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v 1.1 1999/10/17 22:15:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/heapam.h"
#include "access/nbtree.h"
#include "miscadmin.h"
#include "utils/logtape.h"
#include "utils/tuplesort.h"
/*
 * Possible states of a Tuplesort object.  These denote the states that
 * persist between calls of Tuplesort routines.  The normal progression
 * is INITIAL -> SORTEDINMEM for small sorts, or
 * INITIAL -> BUILDRUNS -> SORTEDONTAPE/FINALMERGE for sorts that spill
 * to tape.
 */
typedef enum
{
	TSS_INITIAL,				/* Loading tuples; still within memory limit */
	TSS_BUILDRUNS,				/* Loading tuples; writing to tape */
	TSS_SORTEDINMEM,			/* Sort completed entirely in memory */
	TSS_SORTEDONTAPE,			/* Sort completed, final run is on tape */
	TSS_FINALMERGE				/* Performing final merge on-the-fly */
} TupSortStatus;
/*
 * We use a seven-tape polyphase merge, which is the "sweet spot" on the
 * tapes-to-passes curve according to Knuth's figure 70 (section 5.4.2).
 * (More tapes would reduce passes but increase per-pass overhead.)
 */
#define MAXTAPES		7		/* Knuth's T */
#define TAPERANGE		(MAXTAPES-1)	/* Knuth's P: number of input tapes */
/*
 * Private state of a Tuplesort operation.  Callers see only the opaque
 * struct tag (via utils/tuplesort.h); all fields are internal to this file.
 */
struct Tuplesortstate
{
	TupSortStatus status;		/* enumerated value as shown above */
	bool		randomAccess;	/* did caller request random access? */
	long		availMem;		/* remaining memory available, in bytes */
	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */

	/*
	 * These function pointers decouple the routines that must know what kind
	 * of tuple we are sorting from the routines that don't need to know it.
	 * They are set up by the tuplesort_begin_xxx routines.
	 *
	 * Function to compare two tuples; result is per qsort() convention,
	 * ie, <0, 0, >0 according as a<b, a=b, a>b.
	 */
	int (*comparetup) (Tuplesortstate *state, const void *a, const void *b);

	/*
	 * Function to copy a supplied input tuple into palloc'd space.
	 * (NB: we assume that a single pfree() is enough to release the tuple
	 * later, so the representation must be "flat" in one palloc chunk.)
	 * state->availMem must be decreased by the amount of space used.
	 */
	void * (*copytup) (Tuplesortstate *state, void *tup);

	/*
	 * Function to write a stored tuple onto tape.  The representation of
	 * the tuple on tape need not be the same as it is in memory; requirements
	 * on the tape representation are given below.  After writing the tuple,
	 * pfree() it, and increase state->availMem by the amount of memory space
	 * thereby released.
	 */
	void (*writetup) (Tuplesortstate *state, int tapenum, void *tup);

	/*
	 * Function to read a stored tuple from tape back into memory.
	 * 'len' is the already-read length of the stored tuple.  Create and
	 * return a palloc'd copy, and decrease state->availMem by the amount
	 * of memory space consumed.
	 */
	void * (*readtup) (Tuplesortstate *state, int tapenum, unsigned int len);

	/*
	 * This array holds "unsorted" tuples during the input phases.
	 * If we are able to complete the sort in memory, it holds the
	 * final sorted result as well.
	 */
	void	  **memtuples;		/* array of pointers to palloc'd tuples */
	int			memtupcount;	/* number of tuples currently present */
	int			memtupsize;		/* allocated length of memtuples array */

	/*
	 * This array holds the partially-sorted "heap" of tuples that will go
	 * out in the current run during BUILDRUNS state.  While completing
	 * the sort, we use it to merge runs of tuples from input tapes.
	 * It is never allocated unless we need to use tapes.
	 */
	void	  **heaptuples;		/* array of pointers to palloc'd tuples */
	int			heaptupcount;	/* number of tuples currently present */
	int			heaptupsize;	/* allocated length of heaptuples array */

	/*
	 * While merging, this array holds the actual number of the input tape
	 * that each tuple in heaptuples[] came from.
	 */
	int		   *heapsrctapes;

	/*
	 * Variables for Algorithm D.  Note that destTape is a "logical" tape
	 * number, ie, an index into the tp_xxx[] arrays.  Be careful to keep
	 * "logical" and "actual" tape numbers straight!
	 */
	int			Level;			/* Knuth's l */
	int			destTape;		/* current output tape (Knuth's j, less 1) */
	int			tp_fib[MAXTAPES];	/* Target Fibonacci run counts (A[]) */
	int			tp_runs[MAXTAPES];	/* # of real runs on each tape */
	int			tp_dummy[MAXTAPES]; /* # of dummy runs for each tape (D[]) */
	int			tp_tapenum[MAXTAPES];	/* Actual tape numbers (TAPE[]) */
	bool		multipleRuns;	/* T if we have created more than 1 run */

	/*
	 * These variables are used after completion of sorting to keep track
	 * of the next tuple to return.  (In the tape case, the tape's current
	 * read position is also critical state.)
	 */
	int			result_tape;	/* actual tape number of finished output */
	int			current;		/* array index (only used if SORTEDINMEM) */
	bool		eof_reached;	/* reached EOF (needed for cursors) */

	/* markpos_xxx holds marked position for mark and restore */
	long		markpos_block;	/* tape block# (only used if SORTEDONTAPE) */
	int			markpos_offset; /* saved "current", or offset in tape block */
	bool		markpos_eof;	/* saved "eof_reached" */

	/*
	 * These variables are specific to the HeapTuple case; they are set
	 * by tuplesort_begin_heap and used only by the HeapTuple routines.
	 */
	TupleDesc	tupDesc;
	int			nKeys;
	ScanKey		scanKeys;

	/*
	 * These variables are specific to the IndexTuple case; they are set
	 * by tuplesort_begin_index and used only by the IndexTuple routines.
	 */
	Relation	indexRel;
	bool		enforceUnique;	/* complain if we find duplicate tuples */
};
/*
 * Convenience macros for invoking the tuple-kind-specific methods and
 * for tracking memory consumption against the SortMem budget.
 * LACKMEM is true once we have used more than the allowed amount.
 */
#define COMPARETUP(state,a,b)	((*(state)->comparetup) (state, a, b))
#define COPYTUP(state,tup)	((*(state)->copytup) (state, tup))
#define WRITETUP(state,tape,tup)	((*(state)->writetup) (state, tape, tup))
#define READTUP(state,tape,len)	((*(state)->readtup) (state, tape, len))
#define LACKMEM(state)		((state)->availMem < 0)
#define USEMEM(state,amt)	((state)->availMem -= (amt))
#define FREEMEM(state,amt)	((state)->availMem += (amt))
/*--------------------
*
* NOTES about on-tape representation of tuples:
*
* We require the first "unsigned int" of a stored tuple to be the total size
* on-tape of the tuple, including itself (so it is never zero; an all-zero
* unsigned int is used to delimit runs). The remainder of the stored tuple
* may or may not match the in-memory representation of the tuple ---
* any conversion needed is the job of the writetup and readtup routines.
*
* If state->randomAccess is true, then the stored representation of the
* tuple must be followed by another "unsigned int" that is a copy of the
* length --- so the total tape space used is actually sizeof(unsigned int)
* more than the stored length value. This allows read-backwards. When
* randomAccess is not true, the write/read routines may omit the extra
* length word.
*
* writetup is expected to write both length words as well as the tuple
* data. When readtup is called, the tape is positioned just after the
* front length word; readtup must read the tuple data and advance past
* the back length word (if present).
*
* The write/read routines can make use of the tuple description data
* stored in the Tuplesortstate record, if needed. They are also expected
* to adjust state->availMem by the amount of memory space (not tape space!)
* released or consumed. There is no error return from either writetup
* or readtup; they should elog() on failure.
*
*
* NOTES about memory consumption calculations:
*
* We count space requested for tuples against the SortMem limit.
* Fixed-size space (primarily the LogicalTapeSet I/O buffers) is not
* counted, nor do we count the variable-size memtuples and heaptuples
* arrays. (Even though those could grow pretty large, they should be
* small compared to the tuples proper, so this is not unreasonable.)
*
* The major deficiency in this approach is that it ignores palloc overhead.
* The memory space actually allocated for a palloc chunk is always more
* than the request size, and could be considerably more (as much as 2X
* larger, in the current aset.c implementation). So the space used could
* be considerably more than SortMem says.
*
* One way to fix this is to add a memory management function that, given
* a pointer to a palloc'd chunk, returns the actual space consumed by the
* chunk. This would be very easy in the current aset.c module, but I'm
* hesitant to do it because it might be unpleasant to support in future
* implementations of memory management. (For example, a direct
* implementation of palloc as malloc could not support such a function
* portably.)
*
* A cruder answer is just to apply a fudge factor, say by initializing
* availMem to only three-quarters of what SortMem indicates. This is
* probably the right answer if anyone complains that SortMem is not being
* obeyed very faithfully.
*
*--------------------
*/
/* Forward declarations of local routines (tuple-kind-independent core) */
static Tuplesortstate *tuplesort_begin_common(bool randomAccess);
static void inittapes(Tuplesortstate *state);
static void selectnewtape(Tuplesortstate *state);
static void mergeruns(Tuplesortstate *state);
static void mergeonerun(Tuplesortstate *state);
static void beginmerge(Tuplesortstate *state);
static void beginrun(Tuplesortstate *state);
static void dumptuples(Tuplesortstate *state, bool alltuples);
static void tuplesort_heap_insert(Tuplesortstate *state, void *tuple,
					  int tapenum);
static void tuplesort_heap_siftup(Tuplesortstate *state);
static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
static void markrunend(Tuplesortstate *state, int tapenum);
static int	qsort_comparetup(const void *a, const void *b);

/* HeapTuple-specific method implementations */
static int comparetup_heap(Tuplesortstate *state,
				const void *a, const void *b);
static void *copytup_heap(Tuplesortstate *state, void *tup);
static void writetup_heap(Tuplesortstate *state, int tapenum, void *tup);
static void *readtup_heap(Tuplesortstate *state, int tapenum,
			 unsigned int len);

/* IndexTuple-specific method implementations */
static int comparetup_index(Tuplesortstate *state,
				 const void *a, const void *b);
static void *copytup_index(Tuplesortstate *state, void *tup);
static void writetup_index(Tuplesortstate *state, int tapenum, void *tup);
static void *readtup_index(Tuplesortstate *state, int tapenum,
			  unsigned int len);

/*
 * Since qsort(3) will not pass any context info to qsort_comparetup(),
 * we have to use this ugly static variable.  It is set to point to the
 * active Tuplesortstate object just before calling qsort.  It should
 * not be used directly by anything except qsort_comparetup().
 */
static Tuplesortstate *qsort_tuplesortstate;
/*
 * tuplesort_begin_xxx
 *
 *	Initialize for a tuple sort operation.
 *
 * Usage protocol: after calling a tuplesort_begin_xxx routine, feed tuples
 * with tuplesort_puttuple (zero or more times), then call
 * tuplesort_performsort once all input has been supplied.  After that,
 * fetch the sorted tuples via tuplesort_gettuple until it returns NULL.
 * (If randomAccess was requested, rescan/markpos/restorepos may also be
 * used.)  Finally, call tuplesort_end to release memory and disk space.
 *
 * tuplesort_begin_common handles the kind-independent initialization;
 * the public begin routines then fill in the method pointers.
 */
static Tuplesortstate *
tuplesort_begin_common(bool randomAccess)
{
	Tuplesortstate *ts = (Tuplesortstate *) palloc(sizeof(Tuplesortstate));

	/* start from all-zeroes, then set the fields that need other values */
	MemSet((char *) ts, 0, sizeof(Tuplesortstate));

	ts->status = TSS_INITIAL;
	ts->randomAccess = randomAccess;
	ts->availMem = SortMem * 1024L;	/* SortMem is given in kilobytes */
	ts->tapeset = NULL;

	/* unsorted-tuple array starts small and is doubled as needed */
	ts->memtupsize = 1024;		/* initial guess */
	ts->memtupcount = 0;
	ts->memtuples = (void **) palloc(ts->memtupsize * sizeof(void *));

	/* heap support is not allocated until and unless we spill to tape */
	ts->heaptuples = NULL;
	ts->heaptupsize = 0;
	ts->heaptupcount = 0;
	ts->heapsrctapes = NULL;

	/* Algorithm D variables will be initialized by inittapes, if needed */

	ts->result_tape = -1;		/* flag that result tape has not been formed */

	return ts;
}
/*
 * tuplesort_begin_heap
 *
 *	Begin a sort of HeapTuples, ordered per the given scan keys.
 *	The caller retains ownership of tupDesc and keys; they must remain
 *	valid for the life of the sort.
 */
Tuplesortstate *
tuplesort_begin_heap(TupleDesc tupDesc,
					 int nkeys, ScanKey keys,
					 bool randomAccess)
{
	Tuplesortstate *sortstate = tuplesort_begin_common(randomAccess);

	/* caller must supply at least one valid sort key */
	AssertArg(nkeys >= 1);
	AssertArg(keys[0].sk_attno != 0);
	AssertArg(keys[0].sk_procedure != 0);

	/* install the HeapTuple-specific methods */
	sortstate->comparetup = comparetup_heap;
	sortstate->copytup = copytup_heap;
	sortstate->writetup = writetup_heap;
	sortstate->readtup = readtup_heap;

	/* remember the sort specification */
	sortstate->tupDesc = tupDesc;
	sortstate->nKeys = nkeys;
	sortstate->scanKeys = keys;

	return sortstate;
}
/*
 * tuplesort_begin_index
 *
 *	Begin a sort of IndexTuples, ordered per the given index relation's
 *	key ordering.  If enforceUnique, duplicate keys will draw an error.
 */
Tuplesortstate *
tuplesort_begin_index(Relation indexRel,
					  bool enforceUnique,
					  bool randomAccess)
{
	Tuplesortstate *sortstate = tuplesort_begin_common(randomAccess);

	/* install the IndexTuple-specific methods */
	sortstate->comparetup = comparetup_index;
	sortstate->copytup = copytup_index;
	sortstate->writetup = writetup_index;
	sortstate->readtup = readtup_index;

	/* remember the sort specification */
	sortstate->indexRel = indexRel;
	sortstate->enforceUnique = enforceUnique;

	return sortstate;
}
/*
 * tuplesort_end
 *
 *	Release resources and clean up.  Closes the logical tape set (which
 *	releases the underlying temp-file space), frees any remaining tuples
 *	and the tuple-pointer arrays, and finally frees the Tuplesortstate
 *	itself.  The caller must not touch 'state' afterward.
 */
void
tuplesort_end(Tuplesortstate *state)
{
	int			i;

	if (state->tapeset)
		LogicalTapeSetClose(state->tapeset);
	if (state->memtuples)
	{
		for (i = 0; i < state->memtupcount; i++)
			pfree(state->memtuples[i]);
		pfree(state->memtuples);
	}
	if (state->heaptuples)
	{
		for (i = 0; i < state->heaptupcount; i++)
			pfree(state->heaptuples[i]);
		pfree(state->heaptuples);
	}
	if (state->heapsrctapes)
		pfree(state->heapsrctapes);

	/*
	 * Free the Tuplesortstate struct itself, too --- otherwise it is
	 * leaked until the enclosing memory context is reset.  All callers
	 * discard their pointer immediately after calling tuplesort_end.
	 */
	pfree((void *) state);
}
/*
 * growmemtuples -- enlarge the unsorted-tuple array to make room for at
 * least one more entry.  (Array space is deliberately not counted against
 * availMem; see the memory-consumption notes above.)
 */
static void
growmemtuples(Tuplesortstate *state)
{
	state->memtupsize *= 2;
	state->memtuples = (void **)
		repalloc(state->memtuples,
				 state->memtupsize * sizeof(void *));
}

/*
 * Accept one tuple while collecting input data for sort.
 *
 * Note that the input tuple is always copied; the caller need not save it.
 */
void
tuplesort_puttuple(Tuplesortstate *state, void *tuple)
{
	/*
	 * Copy the given tuple into memory we control, and decrease availMem.
	 */
	tuple = COPYTUP(state, tuple);

	switch (state->status)
	{
		case TSS_INITIAL:

			/*
			 * Save the copied tuple into the unsorted array.
			 */
			if (state->memtupcount >= state->memtupsize)
				growmemtuples(state);
			state->memtuples[state->memtupcount++] = tuple;

			/*
			 * Done if we still fit in available memory.
			 */
			if (! LACKMEM(state))
				return;

			/*
			 * Nope; time to switch to tape-based operation.
			 */
			inittapes(state);
			beginrun(state);

			/*
			 * Dump tuples until we are back under the limit.
			 */
			dumptuples(state, false);
			break;
		case TSS_BUILDRUNS:

			/*
			 * Insert the copied tuple into the heap if it can go into the
			 * current run; otherwise add it to the unsorted array, whence
			 * it will go into the next run.
			 *
			 * The tuple can go into the current run if it is >= the first
			 * not-yet-output tuple.  (Actually, it could go into the current
			 * run if it is >= the most recently output tuple ... but that
			 * would require keeping around the tuple we last output, and
			 * it's simplest to let writetup free the tuple when written.)
			 *
			 * Note there will always be at least one tuple in the heap
			 * at this point; see dumptuples.
			 */
			Assert(state->heaptupcount > 0);
			if (COMPARETUP(state, tuple, state->heaptuples[0]) >= 0)
			{
				tuplesort_heap_insert(state, tuple, 0);
			}
			else
			{
				if (state->memtupcount >= state->memtupsize)
					growmemtuples(state);
				state->memtuples[state->memtupcount++] = tuple;
			}

			/*
			 * If we are over the memory limit, dump tuples till we're under.
			 */
			dumptuples(state, false);
			break;
		default:
			elog(ERROR, "tuplesort_puttuple: invalid state");
			break;
	}
}
/*
 * All tuples have been provided; finish the sort.
 *
 * On return, the state is either TSS_SORTEDINMEM (everything fit in
 * memory and has been qsort'ed) or, for tape-based sorts, whatever
 * status mergeruns() leaves behind (TSS_SORTEDONTAPE or TSS_FINALMERGE).
 */
void
tuplesort_performsort(Tuplesortstate *state)
{
	if (state->status == TSS_INITIAL)
	{
		/*
		 * We were able to accumulate all the tuples within the
		 * allowed amount of memory.  Just qsort 'em and we're done.
		 */
		if (state->memtupcount > 1)
		{
			/* stash state where qsort_comparetup can find it */
			qsort_tuplesortstate = state;
			qsort((void *) state->memtuples, state->memtupcount,
				  sizeof(void *), qsort_comparetup);
		}
		state->current = 0;
		state->eof_reached = false;
		state->markpos_offset = 0;
		state->markpos_eof = false;
		state->status = TSS_SORTEDINMEM;
	}
	else if (state->status == TSS_BUILDRUNS)
	{
		/*
		 * Finish tape-based sort.  First, flush all tuples remaining
		 * in memory out to tape; then merge until we have a single
		 * remaining run (or, if !randomAccess, one run per tape).
		 * Note that mergeruns sets the correct status.
		 */
		dumptuples(state, true);
		mergeruns(state);
		state->eof_reached = false;
		state->markpos_block = 0L;
		state->markpos_offset = 0;
		state->markpos_eof = false;
	}
	else
		elog(ERROR, "tuplesort_performsort: invalid state");
}
/*
 * Fetch the next tuple in either forward or back direction.
 * Returns NULL if no more tuples.  If should_free is set, the
 * caller must pfree the returned tuple when done with it.
 *
 * Backward scan requires randomAccess; it is supported for in-memory
 * sorts and frozen result tapes, but not for an on-the-fly final merge.
 */
void *
tuplesort_gettuple(Tuplesortstate *state, bool forward,
				   bool *should_free)
{
	unsigned int tuplen;
	void	   *tup;

	switch (state->status)
	{
		case TSS_SORTEDINMEM:
			Assert(forward || state->randomAccess);
			/* tuples stay owned by memtuples[]; caller must not free */
			*should_free = false;
			if (forward)
			{
				if (state->current < state->memtupcount)
					return state->memtuples[state->current++];
				state->eof_reached = true;
				return NULL;
			}
			else
			{
				if (state->current <= 0)
					return NULL;

				/*
				 * if all tuples are fetched already then we return last tuple,
				 * else - tuple before last returned.
				 */
				if (state->eof_reached)
					state->eof_reached = false;
				else
				{
					state->current--;	/* last returned tuple */
					if (state->current <= 0)
						return NULL;
				}
				return state->memtuples[state->current - 1];
			}
			break;
		case TSS_SORTEDONTAPE:
			Assert(forward || state->randomAccess);
			/* READTUP palloc'd the tuple; caller owns it now */
			*should_free = true;
			if (forward)
			{
				if (state->eof_reached)
					return NULL;
				/* zero length word = end of the (single) result run */
				if ((tuplen = getlen(state, state->result_tape, true)) != 0)
				{
					tup = READTUP(state, state->result_tape, tuplen);
					return tup;
				}
				else
				{
					state->eof_reached = true;
					return NULL;
				}
			}

			/* Backward.
			 *
			 * if all tuples are fetched already then we return last tuple,
			 * else - tuple before last returned.
			 *
			 * Backward motion relies on the trailing length word that
			 * writetup_* emits when randomAccess is set.
			 */
			if (state->eof_reached)
			{
				/*
				 * Seek position is pointing just past the zero tuplen
				 * at the end of file; back up to fetch last tuple's ending
				 * length word.  If seek fails we must have a completely empty
				 * file.
				 */
				if (! LogicalTapeBackspace(state->tapeset,
										   state->result_tape,
										   2 * sizeof(unsigned int)))
					return NULL;
				state->eof_reached = false;
			}
			else
			{
				/*
				 * Back up and fetch previously-returned tuple's ending length
				 * word.  If seek fails, assume we are at start of file.
				 */
				if (! LogicalTapeBackspace(state->tapeset,
										   state->result_tape,
										   sizeof(unsigned int)))
					return NULL;
				tuplen = getlen(state, state->result_tape, false);

				/*
				 * Back up to get ending length word of tuple before it.
				 */
				if (! LogicalTapeBackspace(state->tapeset,
										   state->result_tape,
										   tuplen + 2 * sizeof(unsigned int)))
				{
					/* If that fails, presumably the prev tuple is the first
					 * in the file.  Back up so that it becomes next to read
					 * in forward direction (not obviously right, but that is
					 * what in-memory case does).
					 */
					if (! LogicalTapeBackspace(state->tapeset,
											   state->result_tape,
											   tuplen + sizeof(unsigned int)))
						elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan");
					return NULL;
				}
			}

			tuplen = getlen(state, state->result_tape, false);

			/*
			 * Now we have the length of the prior tuple, back up and read it.
			 * Note: READTUP expects we are positioned after the initial
			 * length word of the tuple, so back up to that point.
			 */
			if (! LogicalTapeBackspace(state->tapeset,
									   state->result_tape,
									   tuplen))
				elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan");
			tup = READTUP(state, state->result_tape, tuplen);
			return tup;
		case TSS_FINALMERGE:
			Assert(forward);
			/* READTUP palloc'd the tuple; caller owns it now */
			*should_free = true;

			/*
			 * This code should match the inner loop of mergeonerun():
			 * pop the heap's smallest tuple, then refill the heap from
			 * the same source tape (if it has another tuple).
			 */
			if (state->heaptupcount > 0)
			{
				int srcTape = state->heapsrctapes[0];
				tup = state->heaptuples[0];
				tuplesort_heap_siftup(state);
				if ((tuplen = getlen(state, srcTape, true)) != 0)
				{
					void *newtup = READTUP(state, srcTape, tuplen);
					tuplesort_heap_insert(state, newtup, srcTape);
				}
				return tup;
			}
			return NULL;
		default:
			elog(ERROR, "tuplesort_gettuple: invalid state");
			return NULL;		/* keep compiler quiet */
	}
}
/*
 * inittapes - initialize for tape sorting.
 *
 * This is called only if we have found we don't have room to sort in memory.
 * Creates the logical tape set and sets up Algorithm D's bookkeeping
 * (step D1), then switches the state to TSS_BUILDRUNS.
 */
static void
inittapes(Tuplesortstate *state)
{
	int			i;

	state->tapeset = LogicalTapeSetCreate(MAXTAPES);

	/*
	 * Size the heap array at ~1.25 * current in-memory tuple count;
	 * memtupcount is a reasonable guess at how many tuples the heap
	 * will need to hold at once.
	 */
	state->heaptupcount = 0;
	state->heaptupsize = state->memtupcount + state->memtupcount / 4;
	state->heaptuples = (void **) palloc(state->heaptupsize * sizeof(void *));

	/* Knuth Algorithm D, step D1: initial Fibonacci distribution */
	for (i = 0; i < MAXTAPES; i++)
	{
		state->tp_fib[i] = 1;
		state->tp_runs[i] = 0;
		state->tp_dummy[i] = 1;
		state->tp_tapenum[i] = i;
	}
	state->tp_fib[TAPERANGE] = 0;
	state->tp_dummy[TAPERANGE] = 0;

	state->Level = 1;
	state->destTape = 0;
	state->multipleRuns = false;
	state->status = TSS_BUILDRUNS;
}
/*
 * selectnewtape -- select new tape for new initial run.
 *
 * This is called after finishing a run when we know another run
 * must be started.  This implements steps D3, D4 of Algorithm D.
 */
static void
selectnewtape(Tuplesortstate *state)
{
	int			t;
	int			newfib;

	/* Starting another run means we now have more than one. */
	state->multipleRuns = true;

	/* Step D3: advance destTape while the next tape has more dummies */
	if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
	{
		state->destTape++;
		return;
	}

	/* dummies remain at the current level: wrap back to tape 0 */
	if (state->tp_dummy[state->destTape] != 0)
	{
		state->destTape = 0;
		return;
	}

	/* Step D4: increase level, recompute Fibonacci run targets */
	state->Level++;
	newfib = state->tp_fib[0];
	for (t = 0; t < TAPERANGE; t++)
	{
		state->tp_dummy[t] = newfib + state->tp_fib[t + 1] - state->tp_fib[t];
		state->tp_fib[t] = newfib + state->tp_fib[t + 1];
	}
	state->destTape = 0;
}
/*
 * mergeruns -- merge all the completed initial runs.
 *
 * This implements steps D5, D6 of Algorithm D.  All input data has
 * already been written to initial runs on tape (see dumptuples).
 * On return, status is TSS_SORTEDONTAPE (result fully materialized on
 * result_tape) or TSS_FINALMERGE (last merge pass left to be done
 * on-the-fly during gettuple).
 */
static void
mergeruns(Tuplesortstate *state)
{
	int			tapenum,
				svTape,
				svRuns,
				svDummy;

	Assert(state->status == TSS_BUILDRUNS);
	Assert(state->memtupcount == 0 && state->heaptupcount == 0);

	/*
	 * If we produced only one initial run (quite likely if the total
	 * data volume is between 1X and 2X SortMem), we can just use that
	 * tape as the finished output, rather than doing a useless merge.
	 */
	if (! state->multipleRuns)
	{
		state->result_tape = state->tp_tapenum[state->destTape];
		/* must freeze and rewind the finished output tape */
		LogicalTapeFreeze(state->tapeset, state->result_tape);
		state->status = TSS_SORTEDONTAPE;
		return;
	}

	/* End of step D2: rewind all output tapes to prepare for merging */
	for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
		LogicalTapeRewind(state->tapeset, tapenum, false);

	for (;;)
	{
		/* Step D5: merge runs onto tape[T] until tape[P] is empty */
		while (state->tp_runs[TAPERANGE-1] || state->tp_dummy[TAPERANGE-1])
		{
			/* allDummy: next run on every input tape is a dummy (empty) */
			bool allDummy = true;
			/* allOneRun: each input tape holds exactly one run, real or dummy */
			bool allOneRun = true;

			for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
			{
				if (state->tp_dummy[tapenum] == 0)
					allDummy = false;
				if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
					allOneRun = false;
			}

			/*
			 * If we don't have to produce a materialized sorted tape,
			 * quit as soon as we're down to one real/dummy run per tape.
			 */
			if (! state->randomAccess && allOneRun)
			{
				Assert(! allDummy);
				/* Initialize for the final merge pass */
				beginmerge(state);
				state->status = TSS_FINALMERGE;
				return;
			}
			if (allDummy)
			{
				/* "merging" dummies just yields a dummy on the output */
				state->tp_dummy[TAPERANGE]++;
				for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
					state->tp_dummy[tapenum]--;
			}
			else
			{
				mergeonerun(state);
			}
		}

		/* Step D6: decrease level */
		if (--state->Level == 0)
			break;
		/* rewind output tape T to use as new input */
		LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE],
						  false);
		/* rewind used-up input tape P, and prepare it for write pass */
		LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE-1],
						  true);
		state->tp_runs[TAPERANGE-1] = 0;

		/* reassign tape units per step D6; note we no longer care about A[] */
		svTape = state->tp_tapenum[TAPERANGE];
		svDummy = state->tp_dummy[TAPERANGE];
		svRuns = state->tp_runs[TAPERANGE];
		for (tapenum = TAPERANGE; tapenum > 0; tapenum--)
		{
			state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum-1];
			state->tp_dummy[tapenum] = state->tp_dummy[tapenum-1];
			state->tp_runs[tapenum] = state->tp_runs[tapenum-1];
		}
		state->tp_tapenum[0] = svTape;
		state->tp_dummy[0] = svDummy;
		state->tp_runs[0] = svRuns;
	}

	/*
	 * Done.  Knuth says that the result is on TAPE[1], but since we exited
	 * the loop without performing the last iteration of step D6, we have not
	 * rearranged the tape unit assignment, and therefore the result is on
	 * TAPE[T].  We need to do it this way so that we can freeze the final
	 * output tape while rewinding it.  The last iteration of step D6 would
	 * be a waste of cycles anyway...
	 */
	state->result_tape = state->tp_tapenum[TAPERANGE];
	LogicalTapeFreeze(state->tapeset, state->result_tape);
	state->status = TSS_SORTEDONTAPE;
}
/*
 * Merge one run from each input tape, except ones with dummy runs.
 *
 * This is the inner loop of Algorithm D step D5.  We know that the
 * output tape is TAPE[T].
 */
static void
mergeonerun(Tuplesortstate *state)
{
	int			outputTape = state->tp_tapenum[TAPERANGE];
	int			spentTape;
	unsigned int len;
	void	   *newtup;

	/*
	 * Prime the heap with the first tuple of each nondummy input run,
	 * decrementing the run/dummy counts as we go.
	 */
	beginmerge(state);

	/*
	 * Repeatedly pop the heap's smallest tuple to the output tape, then
	 * refill the heap from the tape that tuple came from (if that tape's
	 * run has another tuple).
	 */
	for (;;)
	{
		if (state->heaptupcount <= 0)
			break;
		WRITETUP(state, outputTape, state->heaptuples[0]);
		spentTape = state->heapsrctapes[0];
		tuplesort_heap_siftup(state);
		len = getlen(state, spentTape, true);
		if (len != 0)
		{
			newtup = READTUP(state, spentTape, len);
			tuplesort_heap_insert(state, newtup, spentTape);
		}
	}

	/*
	 * Heap is empty: the merged run is complete.  Mark its end on the
	 * output tape and count it as a real run there.
	 */
	markrunend(state, outputTape);
	state->tp_runs[TAPERANGE]++;
}
/*
 * beginmerge - initialize for a merge pass
 *
 * We load the first tuple from each nondummy input run into the heap,
 * and decrease the counts of real and dummy runs for each tape.
 */
static void
beginmerge(Tuplesortstate *state)
{
	int			t;

	Assert(state->heaptuples != NULL && state->heaptupcount == 0);

	/* Allocate the source-tape tracking array on first use. */
	if (state->heapsrctapes == NULL)
		state->heapsrctapes = (int *) palloc(MAXTAPES * sizeof(int));

	for (t = 0; t < TAPERANGE; t++)
	{
		if (state->tp_dummy[t] > 0)
		{
			/* dummy run: nothing to read, just consume the count */
			state->tp_dummy[t]--;
		}
		else
		{
			int			tape;
			unsigned int len;
			void	   *tup;

			Assert(state->tp_runs[t] > 0);
			state->tp_runs[t]--;
			tape = state->tp_tapenum[t];
			len = getlen(state, tape, false);
			tup = READTUP(state, tape, len);
			tuplesort_heap_insert(state, tup, tape);
		}
	}
}
/*
 * beginrun - start a new initial run
 *
 * Moves the tuples currently sitting in the unsorted memtuples[] array
 * into the heap, leaving memtuples[] empty.
 */
static void
beginrun(Tuplesortstate *state)
{
	int			n = state->memtupcount;
	int			i;

	Assert(state->heaptupcount == 0 && n > 0);
	/* tapenum 0 is a dummy value; heapsrctapes is unused while building runs */
	for (i = 0; i < n; i++)
		tuplesort_heap_insert(state, state->memtuples[i], 0);
	state->memtupcount = 0;
}
/*
 * dumptuples - remove tuples from heap and write to tape
 *
 * When alltuples = false, dump only enough tuples to get under the
 * availMem limit (and leave at least one tuple in the heap in any case,
 * since puttuple assumes it always has a tuple to compare to).
 *
 * When alltuples = true, dump everything currently in memory.
 * (This case is only used at end of input data.)
 *
 * If we empty the heap, then start a new run using the tuples that
 * have accumulated in memtuples[] (if any).
 */
static void
dumptuples(Tuplesortstate *state, bool alltuples)
{
	while (alltuples ||
		   (LACKMEM(state) &&
			(state->heaptupcount > 0 || state->memtupcount > 0)))
	{
		/*
		 * Dump the heap's frontmost entry, and sift up to remove it
		 * from the heap.  (WRITETUP releases the tuple's memory.)
		 */
		Assert(state->heaptupcount > 0);
		WRITETUP(state, state->tp_tapenum[state->destTape],
				 state->heaptuples[0]);
		tuplesort_heap_siftup(state);
		/*
		 * If the heap is now empty, we've finished a run.
		 */
		if (state->heaptupcount == 0)
		{
			/* mark end-of-run on tape; bump real runs, drop a dummy */
			markrunend(state, state->tp_tapenum[state->destTape]);
			state->tp_runs[state->destTape]++;
			state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
			if (state->memtupcount == 0)
				break;			/* all input data has been written to tape */
			/* Select new output tape and start a new run */
			selectnewtape(state);
			beginrun(state);
		}
	}
}
/*
 * tuplesort_rescan - rewind and replay the scan
 *
 * Only legal when the sort was begun with randomAccess = true.
 */
void
tuplesort_rescan(Tuplesortstate *state)
{
	Assert(state->randomAccess);

	if (state->status == TSS_SORTEDINMEM)
	{
		/* reset the array read cursor and clear the saved mark */
		state->current = 0;
		state->eof_reached = false;
		state->markpos_offset = 0;
		state->markpos_eof = false;
	}
	else if (state->status == TSS_SORTEDONTAPE)
	{
		/* rewind the frozen result tape and clear the saved mark */
		LogicalTapeRewind(state->tapeset, state->result_tape, false);
		state->eof_reached = false;
		state->markpos_block = 0L;
		state->markpos_offset = 0;
		state->markpos_eof = false;
	}
	else
		elog(ERROR, "tuplesort_rescan: invalid state");
}
/*
 * tuplesort_markpos - saves current position in the merged sort file
 *
 * Only legal when the sort was begun with randomAccess = true.
 */
void
tuplesort_markpos(Tuplesortstate *state)
{
	Assert(state->randomAccess);

	if (state->status == TSS_SORTEDINMEM)
	{
		/* the mark is just the array index of the read cursor */
		state->markpos_offset = state->current;
		state->markpos_eof = state->eof_reached;
	}
	else if (state->status == TSS_SORTEDONTAPE)
	{
		/* record the tape block/offset of the read position */
		LogicalTapeTell(state->tapeset,
						state->result_tape,
						&state->markpos_block,
						&state->markpos_offset);
		state->markpos_eof = state->eof_reached;
	}
	else
		elog(ERROR, "tuplesort_markpos: invalid state");
}
/*
 * tuplesort_restorepos - restores current position in merged sort file to
 *						  last saved position
 *
 * Only legal when the sort was begun with randomAccess = true.
 */
void
tuplesort_restorepos(Tuplesortstate *state)
{
	Assert(state->randomAccess);

	if (state->status == TSS_SORTEDINMEM)
	{
		state->current = state->markpos_offset;
		state->eof_reached = state->markpos_eof;
	}
	else if (state->status == TSS_SORTEDONTAPE)
	{
		/* seek back to the block/offset recorded by tuplesort_markpos */
		if (! LogicalTapeSeek(state->tapeset,
							  state->result_tape,
							  state->markpos_block,
							  state->markpos_offset))
			elog(ERROR, "tuplesort_restorepos failed");
		state->eof_reached = state->markpos_eof;
	}
	else
		elog(ERROR, "tuplesort_restorepos: invalid state");
}
/*
* Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
*/
/*
 * Insert a new tuple into an empty or existing heap, maintaining the
 * heap invariant.  The heap lives in state->heaptuples[].  Also, if
 * state->heapsrctapes is not NULL, we store each tuple's source tapenum
 * in the corresponding element of state->heapsrctapes[].
 */
static void
tuplesort_heap_insert(Tuplesortstate *state, void *tuple,
					  int tapenum)
{
	int			hole;

	/*
	 * Enlarge heaptuples[] if it is full.  heapsrctapes[] is never
	 * enlarged; it is allocated with MAXTAPES entries (see beginmerge),
	 * which is as many as a merge pass can use.
	 */
	if (state->heaptupcount >= state->heaptupsize)
	{
		state->heaptupsize *= 2;
		state->heaptuples = (void **)
			repalloc(state->heaptuples,
					 state->heaptupsize * sizeof(void *));
	}

	/*
	 * Sift-up per Knuth 5.2.3 exercise 16 (converted from Knuth's
	 * 1-based indexes to 0-based): start a hole at the end of the
	 * array and bubble it toward the root until the parent entry
	 * compares <= the new tuple.
	 */
	hole = state->heaptupcount++;
	while (hole > 0)
	{
		int			parent = (hole - 1) >> 1;

		if (COMPARETUP(state, tuple, state->heaptuples[parent]) >= 0)
			break;
		state->heaptuples[hole] = state->heaptuples[parent];
		if (state->heapsrctapes)
			state->heapsrctapes[hole] = state->heapsrctapes[parent];
		hole = parent;
	}
	state->heaptuples[hole] = tuple;
	if (state->heapsrctapes)
		state->heapsrctapes[hole] = tapenum;
}
/*
 * The tuple at state->heaptuples[0] has been removed from the heap.
 * Decrement heaptupcount, and sift up to maintain the heap invariant.
 *
 * The last array entry is re-inserted at the root and pushed down until
 * both children compare >= it (Knuth Algorithm 5.2.3H, 0-based indexes).
 * heapsrctapes[] entries, if present, travel with their tuples.
 */
static void
tuplesort_heap_siftup(Tuplesortstate *state)
{
	void	  **heaptuples = state->heaptuples;
	void	   *tuple;
	int			i,
				n;

	if (--state->heaptupcount <= 0)
		return;
	n = state->heaptupcount;
	tuple = heaptuples[n];		/* tuple that must be reinserted */
	i = 0;						/* i is where the "hole" is */
	for (;;) {
		int j = 2*i + 1;		/* left child of i */

		if (j >= n)
			break;
		/* pick the smaller of the two children */
		if (j+1 < n &&
			COMPARETUP(state, heaptuples[j], heaptuples[j+1]) > 0)
			j++;
		/* stop once the reinserted tuple fits above both children */
		if (COMPARETUP(state, tuple, heaptuples[j]) <= 0)
			break;
		heaptuples[i] = heaptuples[j];
		if (state->heapsrctapes)
			state->heapsrctapes[i] = state->heapsrctapes[j];
		i = j;
	}
	heaptuples[i] = tuple;
	if (state->heapsrctapes)
		state->heapsrctapes[i] = state->heapsrctapes[n];
}
/*
* Tape interface routines
*/
/*
 * Read a tuple length word from the given tape.
 *
 * A zero length word marks end-of-run; it is an error unless eofOK.
 * A short read means the tape ended where a length word was expected.
 */
static unsigned int
getlen(Tuplesortstate *state, int tapenum, bool eofOK)
{
	unsigned int len;

	if (LogicalTapeRead(state->tapeset, tapenum,
						(void *) &len, sizeof(len)) != sizeof(len))
		elog(ERROR, "tuplesort: unexpected end of tape");
	if (!eofOK && len == 0)
		elog(ERROR, "tuplesort: unexpected end of data");
	return len;
}
/*
 * Write an end-of-run marker (a zero length word) on the given tape.
 */
static void
markrunend(Tuplesortstate *state, int tapenum)
{
	unsigned int zero = 0;

	LogicalTapeWrite(state->tapeset, tapenum, (void *) &zero, sizeof(zero));
}
/*
 * qsort interface
 *
 * qsort() hands us pointers into the memtuples[] array, i.e. pointers
 * to void *; the state must come from the static qsort_tuplesortstate.
 */
static int
qsort_comparetup(const void *a, const void *b)
{
	void	   *ltup = * (void **) a;
	void	   *rtup = * (void **) b;

	return COMPARETUP(qsort_tuplesortstate, ltup, rtup);
}
/*
* Routines specialized for HeapTuple case
*/
/*
 * comparetup_heap: comparison routine for HeapTuples.
 *
 * Compares the tuples on each sort key in turn; the first key with
 * unequal values decides the result.  NULLs sort after all non-NULL
 * values.  The key's sk_func is applied in both argument orders to
 * distinguish <, =, > (presumably sk_func is a boolean "less than"
 * style operator -- TODO confirm against tuplesort_begin_heap callers);
 * SK_COMMUTE swaps the argument order.
 */
static int
comparetup_heap(Tuplesortstate *state, const void *a, const void *b)
{
	HeapTuple ltup = (HeapTuple) a;
	HeapTuple rtup = (HeapTuple) b;
	int nkey;

	for (nkey = 0; nkey < state->nKeys; nkey++)
	{
		ScanKey scanKey = state->scanKeys + nkey;
		Datum lattr,
			rattr;
		bool isnull1,
			isnull2;
		int result;

		lattr = heap_getattr(ltup,
							 scanKey->sk_attno,
							 state->tupDesc,
							 &isnull1);
		rattr = heap_getattr(rtup,
							 scanKey->sk_attno,
							 state->tupDesc,
							 &isnull2);
		if (isnull1)
		{
			if (!isnull2)
				return 1;		/* NULL sorts after non-NULL */
			/* both NULL: treat as equal on this key, go on to next */
		}
		else if (isnull2)
			return -1;
		else if (scanKey->sk_flags & SK_COMMUTE)
		{
			/* commuted: try r<l first, then l<r, to get -1/0/+1 */
			if (!(result = - (int) (*fmgr_faddr(&scanKey->sk_func)) (rattr, lattr)))
				result = (int) (*fmgr_faddr(&scanKey->sk_func)) (lattr, rattr);
			if (result)
				return result;
		}
		else
		{
			/* try l<r first, then r<l, to get -1/0/+1 */
			if (!(result = - (int) (*fmgr_faddr(&scanKey->sk_func)) (lattr, rattr)))
				result = (int) (*fmgr_faddr(&scanKey->sk_func)) (rattr, lattr);
			if (result)
				return result;
		}
	}
	return 0;					/* equal on all keys */
}
/*
 * copytup_heap: make a palloc'd copy of a HeapTuple, charging its full
 * size (HeapTupleData header plus data) against the memory budget.
 */
static void *
copytup_heap(Tuplesortstate *state, void *tup)
{
	HeapTuple	htup = (HeapTuple) tup;

	USEMEM(state, HEAPTUPLESIZE + htup->t_len);
	return (void *) heap_copytuple(htup);
}
/*
 * writetup_heap: write a HeapTuple to tape and release it.
 *
 * We don't bother to write the HeapTupleData part of the tuple;
 * readtup_heap reconstructs it.  Layout on tape: length word, tuple
 * data, and (with randomAccess) a trailing copy of the length word so
 * backward scans can find the tuple.
 */
static void
writetup_heap(Tuplesortstate *state, int tapenum, void *tup)
{
	HeapTuple	htup = (HeapTuple) tup;
	unsigned int len = htup->t_len + sizeof(unsigned int);

	LogicalTapeWrite(state->tapeset, tapenum,
					 (void*) &len, sizeof(len));
	LogicalTapeWrite(state->tapeset, tapenum,
					 (void*) htup->t_data, htup->t_len);
	if (state->randomAccess)
		LogicalTapeWrite(state->tapeset, tapenum,
						 (void*) &len, sizeof(len));

	FREEMEM(state, HEAPTUPLESIZE + htup->t_len);
	pfree(htup);
}
/*
 * readtup_heap: read a HeapTuple from tape, rebuilding the
 * HeapTupleData header in front of the tuple data.
 *
 * len is the on-tape length word (tuple data plus that word itself);
 * the tape is positioned just past the leading length word.
 */
static void *
readtup_heap(Tuplesortstate *state, int tapenum, unsigned int len)
{
	unsigned int datalen = len - sizeof(unsigned int);
	unsigned int alloclen = datalen + HEAPTUPLESIZE;
	HeapTuple	htup;
	unsigned int trailer;

	htup = (HeapTuple) palloc(alloclen);
	USEMEM(state, alloclen);

	/* reconstruct the HeapTupleData portion */
	htup->t_len = datalen;
	ItemPointerSetInvalid(&(htup->t_self));
	htup->t_data = (HeapTupleHeader) (((char *) htup) + HEAPTUPLESIZE);

	/* read in the tuple proper */
	if (LogicalTapeRead(state->tapeset, tapenum, (void *) htup->t_data,
						datalen) != datalen)
		elog(ERROR, "tuplesort: unexpected end of data");

	/* consume the trailing length word, present only with randomAccess */
	if (state->randomAccess)
	{
		if (LogicalTapeRead(state->tapeset, tapenum, (void *) &trailer,
							sizeof(trailer)) != sizeof(trailer))
			elog(ERROR, "tuplesort: unexpected end of data");
	}
	return (void *) htup;
}
/*
* Routines specialized for IndexTuple case
*
* NOTE: actually, these are specialized for the btree case; it's not
* clear whether you could use them for a non-btree index. Possibly
* you'd need to make another set of routines if you needed to sort
* according to another kind of index.
*/
/*
 * comparetup_index: comparison routine for IndexTuples.
 *
 * Compares attribute by attribute using the index's btree ">" strategy
 * function in both directions to distinguish <, =, >.  NULLs sort after
 * non-NULL values; two NULLs compare equal but suppress the uniqueness
 * check below.
 */
static int
comparetup_index(Tuplesortstate *state, const void *a, const void *b)
{
	IndexTuple ltup = (IndexTuple) a;
	IndexTuple rtup = (IndexTuple) b;
	TupleDesc itdesc = state->indexRel->rd_att;
	bool equal_isnull = false;	/* saw a key where both values were NULL */
	Datum lattr,
		rattr;
	bool isnull1,
		isnull2;
	int i;

	for (i = 0; i < itdesc->natts; i++)
	{
		lattr = index_getattr(ltup, i + 1, itdesc, &isnull1);
		rattr = index_getattr(rtup, i + 1, itdesc, &isnull2);
		if (isnull1)
		{
			if (!isnull2)
				return 1;		/* NULL sorts after non-NULL */
			equal_isnull = true;
			continue;
		}
		else if (isnull2)
			return -1;
		/* l > r? */
		if (_bt_invokestrat(state->indexRel, i + 1,
							BTGreaterStrategyNumber,
							lattr, rattr))
			return 1;
		/* r > l? */
		if (_bt_invokestrat(state->indexRel, i + 1,
							BTGreaterStrategyNumber,
							rattr, lattr))
			return -1;
	}

	/*
	 * If btree has asked us to enforce uniqueness, complain if two equal
	 * tuples are detected (unless there was at least one NULL field).
	 *
	 * It is sufficient to make the test here, because if two tuples are
	 * equal they *must* get compared at some stage of the sort --- otherwise
	 * the sort algorithm wouldn't have checked whether one must appear
	 * before the other.
	 */
	if (state->enforceUnique && !equal_isnull)
		elog(ERROR, "Cannot create unique index. Table contains non-unique values");
	return 0;
}
/*
 * copytup_index: make a palloc'd copy of an IndexTuple, charging its
 * size against the memory budget.
 */
static void *
copytup_index(Tuplesortstate *state, void *tup)
{
	IndexTuple	itup = (IndexTuple) tup;
	unsigned int size = IndexTupleSize(itup);
	IndexTuple	copy;

	USEMEM(state, size);
	copy = (IndexTuple) palloc(size);
	memcpy(copy, itup, size);
	return (void *) copy;
}
/*
 * writetup_index: write an IndexTuple to tape and release it.
 *
 * Layout on tape: length word, the whole IndexTuple, and (with
 * randomAccess) a trailing copy of the length word for backward scans.
 */
static void
writetup_index(Tuplesortstate *state, int tapenum, void *tup)
{
	IndexTuple	itup = (IndexTuple) tup;
	unsigned int size = IndexTupleSize(itup);
	unsigned int len = size + sizeof(unsigned int);

	LogicalTapeWrite(state->tapeset, tapenum,
					 (void*) &len, sizeof(len));
	LogicalTapeWrite(state->tapeset, tapenum,
					 (void*) itup, size);
	if (state->randomAccess)
		LogicalTapeWrite(state->tapeset, tapenum,
						 (void*) &len, sizeof(len));

	FREEMEM(state, size);
	pfree(itup);
}
/*
 * readtup_index: read an IndexTuple from tape.
 *
 * len is the on-tape length word (tuple size plus the word itself);
 * the tape is positioned just past the leading length word.
 */
static void *
readtup_index(Tuplesortstate *state, int tapenum, unsigned int len)
{
	unsigned int size = len - sizeof(unsigned int);
	IndexTuple	itup = (IndexTuple) palloc(size);
	unsigned int trailer;

	USEMEM(state, size);
	if (LogicalTapeRead(state->tapeset, tapenum, (void *) itup,
						size) != size)
		elog(ERROR, "tuplesort: unexpected end of data");

	/* consume the trailing length word, present only with randomAccess */
	if (state->randomAccess)
	{
		if (LogicalTapeRead(state->tapeset, tapenum, (void *) &trailer,
							sizeof(trailer)) != sizeof(trailer))
			elog(ERROR, "tuplesort: unexpected end of data");
	}
	return (void *) itup;
}
......@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: nbtree.h,v 1.31 1999/08/08 20:12:49 tgl Exp $
* $Id: nbtree.h,v 1.32 1999/10/17 22:15:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -288,9 +288,12 @@ extern BTItem _bt_formitem(IndexTuple itup);
/*
* prototypes for functions in nbtsort.c
*/
extern void *_bt_spoolinit(Relation index, int ntapes, bool isunique);
extern void _bt_spooldestroy(void *spool);
extern void _bt_spool(Relation index, BTItem btitem, void *spool);
extern void _bt_leafbuild(Relation index, void *spool);
typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
extern BTSpool *_bt_spoolinit(Relation index, bool isunique);
extern void _bt_spooldestroy(BTSpool *btspool);
extern void _bt_spool(BTItem btitem, BTSpool *btspool);
extern void _bt_leafbuild(BTSpool *btspool);
#endif /* NBTREE_H */
......@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: execnodes.h,v 1.36 1999/09/26 21:21:04 tgl Exp $
* $Id: execnodes.h,v 1.37 1999/10/17 22:15:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -597,17 +597,9 @@ typedef struct GroupState
/* ----------------
* SortState information
*
*| sort nodes are really just a kind of a scan since
*| we implement sorts by retrieving the entire subplan
*| into a temp relation, sorting the temp relation into
*| another sorted relation, and then preforming a simple
*| unqualified sequential scan on the sorted relation..
*| -cim 10/15/89
*
* Flag indicated whether relation has been sorted
* Keys scan key structures used to keep info on sort keys
* TempRelation temporary relation containing result of executing
* the subplan.
* sort_Done indicates whether sort has been performed yet
* sort_Keys scan key structures describing the sort keys
* tuplesortstate private state of tuplesort.c
*
* CommonScanState information
*
......@@ -628,9 +620,9 @@ typedef struct GroupState
typedef struct SortState
{
CommonScanState csstate; /* its first field is NodeTag */
bool sort_Flag;
bool sort_Done;
ScanKey sort_Keys;
bool cleaned;
void *tuplesortstate;
} SortState;
/* ----------------
......
......@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: plannodes.h,v 1.30 1999/08/21 03:49:09 tgl Exp $
* $Id: plannodes.h,v 1.31 1999/10/17 22:15:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -288,8 +288,6 @@ typedef struct Sort
Oid nonameid;
int keycount;
SortState *sortstate;
void *psortstate;
bool cleaned;
} Sort;
/* ----------------
......
/*-------------------------------------------------------------------------
*
* tuplesort.h
* Generalized tuple sorting routines.
*
* This module handles sorting of either heap tuples or index tuples
* (and could fairly easily support other kinds of sortable objects,
* if necessary). It works efficiently for both small and large amounts
* of data. Small amounts are sorted in-memory using qsort(). Large
* amounts are sorted using temporary files and a standard external sort
* algorithm.
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: tuplesort.h,v 1.1 1999/10/17 22:15:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef TUPLESORT_H
#define TUPLESORT_H
#include "access/htup.h"
#include "access/itup.h"
#include "access/skey.h"
#include "access/tupdesc.h"
#include "utils/rel.h"
/* Tuplesortstate is an opaque type whose details are not known outside tuplesort.c. */
typedef struct Tuplesortstate Tuplesortstate;
/*
* We provide two different interfaces to what is essentially the same
* code: one for sorting HeapTuples and one for sorting IndexTuples.
* They differ primarily in the way that the sort key information is
* supplied.
*/
extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
int nkeys, ScanKey keys,
bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index(Relation indexRel,
bool enforceUnique,
bool randomAccess);
extern void tuplesort_puttuple(Tuplesortstate *state, void *tuple);
extern void tuplesort_performsort(Tuplesortstate *state);
extern void *tuplesort_gettuple(Tuplesortstate *state, bool forward,
bool *should_free);
#define tuplesort_getheaptuple(state, forward, should_free) \
((HeapTuple) tuplesort_gettuple(state, forward, should_free))
#define tuplesort_getindextuple(state, forward, should_free) \
((IndexTuple) tuplesort_gettuple(state, forward, should_free))
extern void tuplesort_end(Tuplesortstate *state);
/*
* These routines may only be called if randomAccess was specified 'true'.
* Backwards scan in gettuple is likewise only allowed if randomAccess.
*/
extern void tuplesort_rescan(Tuplesortstate *state);
extern void tuplesort_markpos(Tuplesortstate *state);
extern void tuplesort_restorepos(Tuplesortstate *state);
#endif /* TUPLESORT_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment