Commit 2a6ebe70 authored by Tom Lane's avatar Tom Lane

Fix contrib/seg's GiST picksplit method.

This patch replaces Guttman's generalized split method with a simple
sort-by-center-points algorithm.  Since the data is only one-dimensional
we don't really need the slow and none-too-stable Guttman method.

This is in part a bug fix, since seg has the same size_alpha versus
size_beta typo that was recently fixed in contrib/cube.  It seems
prudent to apply this rather aggressive fix only in HEAD, though.
Back branches will just get the typo fix.

Alexander Korotkov, reviewed by Yeb Havinga
parent 290f1603
...@@ -33,6 +33,16 @@ extern void seg_scanner_finish(void); ...@@ -33,6 +33,16 @@ extern void seg_scanner_finish(void);
extern int seg_yydebug; extern int seg_yydebug;
*/ */
/*
* Auxiliary data structure for picksplit method.
*/
typedef struct
{
float center;
OffsetNumber index;
SEG *data;
} gseg_picksplit_item;
/* /*
** Input/Output routines ** Input/Output routines
*/ */
...@@ -292,152 +302,104 @@ gseg_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result) ...@@ -292,152 +302,104 @@ gseg_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result)
return (result); return (result);
} }
/*
* Compare function for gseg_picksplit_item: sort by center.
*/
static int
gseg_picksplit_item_cmp(const void *a, const void *b)
{
const gseg_picksplit_item *i1 = (const gseg_picksplit_item *) a;
const gseg_picksplit_item *i2 = (const gseg_picksplit_item *) b;
if (i1->center < i2->center)
return -1;
else if (i1->center == i2->center)
return 0;
else
return 1;
}
/* /*
** The GiST PickSplit method for segments * The GiST PickSplit method for segments
** We use Guttman's poly time split algorithm *
*/ * We used to use Guttman's split algorithm here, but since the data is 1-D
* it's easier and more robust to just sort the segments by center-point and
* split at the middle.
*/
GIST_SPLITVEC * GIST_SPLITVEC *
gseg_picksplit(GistEntryVector *entryvec, gseg_picksplit(GistEntryVector *entryvec,
GIST_SPLITVEC *v) GIST_SPLITVEC *v)
{ {
OffsetNumber i, int i;
j;
SEG *datum_alpha,
*datum_beta;
SEG *datum_l, SEG *datum_l,
*datum_r; *datum_r,
SEG *union_d, *seg;
*union_dl, gseg_picksplit_item *sort_items;
*union_dr;
SEG *inter_d;
bool firsttime;
float size_alpha,
size_beta,
size_union,
size_inter;
float size_waste,
waste;
float size_l,
size_r;
int nbytes;
OffsetNumber seed_1 = 1,
seed_2 = 2;
OffsetNumber *left, OffsetNumber *left,
*right; *right;
OffsetNumber maxoff; OffsetNumber maxoff;
OffsetNumber firstright;
#ifdef GIST_DEBUG #ifdef GIST_DEBUG
fprintf(stderr, "picksplit\n"); fprintf(stderr, "picksplit\n");
#endif #endif
maxoff = entryvec->n - 2; /* Valid items in entryvec->vector[] are indexed 1..maxoff */
nbytes = (maxoff + 2) * sizeof(OffsetNumber); maxoff = entryvec->n - 1;
v->spl_left = (OffsetNumber *) palloc(nbytes);
v->spl_right = (OffsetNumber *) palloc(nbytes);
firsttime = true;
waste = 0.0;
for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
{
datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
{
datum_beta = (SEG *) DatumGetPointer(entryvec->vector[j].key);
/* compute the wasted space by unioning these guys */
/* size_waste = size_union - size_inter; */
union_d = seg_union(datum_alpha, datum_beta);
rt_seg_size(union_d, &size_union);
inter_d = seg_inter(datum_alpha, datum_beta);
rt_seg_size(inter_d, &size_inter);
size_waste = size_union - size_inter;
/* /*
* are these a more promising split that what we've already seen? * Prepare the auxiliary array and sort it.
*/ */
if (size_waste > waste || firsttime) sort_items = (gseg_picksplit_item *)
palloc(maxoff * sizeof(gseg_picksplit_item));
for (i = 1; i <= maxoff; i++)
{ {
waste = size_waste; seg = (SEG *) DatumGetPointer(entryvec->vector[i].key);
seed_1 = i; /* center calculation is done this way to avoid possible overflow */
seed_2 = j; sort_items[i - 1].center = seg->lower*0.5f + seg->upper*0.5f;
firsttime = false; sort_items[i - 1].index = i;
} sort_items[i - 1].data = seg;
}
} }
qsort(sort_items, maxoff, sizeof(gseg_picksplit_item),
gseg_picksplit_item_cmp);
/* sort items below "firstright" will go into the left side */
firstright = maxoff / 2;
v->spl_left = (OffsetNumber *) palloc(maxoff * sizeof(OffsetNumber));
v->spl_right = (OffsetNumber *) palloc(maxoff * sizeof(OffsetNumber));
left = v->spl_left; left = v->spl_left;
v->spl_nleft = 0; v->spl_nleft = 0;
right = v->spl_right; right = v->spl_right;
v->spl_nright = 0; v->spl_nright = 0;
datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[seed_1].key);
datum_l = seg_union(datum_alpha, datum_alpha);
rt_seg_size(datum_l, &size_l);
datum_beta = (SEG *) DatumGetPointer(entryvec->vector[seed_2].key);
datum_r = seg_union(datum_beta, datum_beta);
rt_seg_size(datum_r, &size_r);
/* /*
* Now split up the regions between the two seeds. An important property * Emit segments to the left output page, and compute its bounding box.
* of this split algorithm is that the split vector v has the indices of
* items to be split in order in its left and right vectors. We exploit
* this property by doing a merge in the code that actually splits the
* page.
*
* For efficiency, we also place the new index tuple in this loop. This is
* handled at the very end, when we have placed all the existing tuples
* and i == maxoff + 1.
*/ */
datum_l = (SEG *) palloc(sizeof(SEG));
maxoff = OffsetNumberNext(maxoff); memcpy(datum_l, sort_items[0].data, sizeof(SEG));
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) *left++ = sort_items[0].index;
{
/*
* If we've already decided where to place this item, just put it on
* the right list. Otherwise, we need to figure out which page needs
* the least enlargement in order to store the item.
*/
if (i == seed_1)
{
*left++ = i;
v->spl_nleft++; v->spl_nleft++;
continue; for (i = 1; i < firstright; i++)
}
else if (i == seed_2)
{ {
*right++ = i; datum_l = seg_union(datum_l, sort_items[i].data);
v->spl_nright++; *left++ = sort_items[i].index;
continue;
}
/* okay, which page needs least enlargement? */
datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
union_dl = seg_union(datum_l, datum_alpha);
union_dr = seg_union(datum_r, datum_alpha);
rt_seg_size(union_dl, &size_alpha);
rt_seg_size(union_dr, &size_beta);
/* pick which page to add it to */
if (size_alpha - size_l < size_beta - size_r)
{
datum_l = union_dl;
size_l = size_alpha;
*left++ = i;
v->spl_nleft++; v->spl_nleft++;
} }
else
/*
* Likewise for the right page.
*/
datum_r = (SEG *) palloc(sizeof(SEG));
memcpy(datum_r, sort_items[firstright].data, sizeof(SEG));
*right++ = sort_items[firstright].index;
v->spl_nright++;
for (i = firstright + 1; i < maxoff; i++)
{ {
datum_r = union_dr; datum_r = seg_union(datum_r, sort_items[i].data);
size_r = size_alpha; *right++ = sort_items[i].index;
*right++ = i;
v->spl_nright++; v->spl_nright++;
} }
}
*left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */
v->spl_ldatum = PointerGetDatum(datum_l); v->spl_ldatum = PointerGetDatum(datum_l);
v->spl_rdatum = PointerGetDatum(datum_r); v->spl_rdatum = PointerGetDatum(datum_r);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment