Commit 1ebdec8c authored by Tom Lane's avatar Tom Lane

Rethink handling of pass-by-value leaf datums in SP-GiST.

The existing convention in SP-GiST is that any pass-by-value datatype
is stored in Datum representation, i.e. it's of width sizeof(Datum)
even when typlen is less than that.  This is okay, or at least it's
too late to change it, for prefix datums and node-label datums in inner
(upper) tuples.  But it's problematic for leaf datums, because we'd
prefer those to be stored in Postgres' standard on-disk representation
so that we can easily extend leaf tuples to carry additional "included"
columns.

I believe, however, that we can get away with just up and changing that.
This would be an unacceptable on-disk-format break, but there are two
big mitigating factors:

1. It seems quite unlikely that there are any SP-GiST opclasses out
there that use pass-by-value leaf datatypes.  Certainly none of the
ones in core do, nor has codesearch.debian.net heard of any.  Given
what SP-GiST is good for, it's hard to conceive of a use-case where
the leaf-level values would be both small and fixed-width.  (As an
example, if you wanted to index text values with the leaf level being
just a byte, then every text string would have to be represented
with one level of inner tuple per preceding byte, which would be
horrendously space-inefficient and slow to access.  You always want
to use as few inner-tuple levels as possible, leaving as much as
possible in the leaf values.)

2. Even granting that you have such an index, this change only
breaks things on big-endian machines.  On little-endian, the high
order bytes of the Datum format will now just appear to be alignment
padding space.

So, change the code to store pass-by-value leaf datums in their
usual on-disk form.  Inner-tuple datums are not touched.

This is extracted from a larger patch that intends to add support for
"included" columns.  I'm committing it separately for visibility in
our commit logs.

Pavel Borisov and Tom Lane, reviewed by Andrey Borodin

Discussion: https://postgr.es/m/CALT9ZEFi-vMp4faht9f9Junb1nO3NOSjhpxTmbm1UGLMsLqiEQ@mail.gmail.com
parent c9c41c7a
......@@ -731,13 +731,6 @@ doPickSplit(Relation index, SpGistState *state,
* also, count up the amount of space that will be freed from current.
* (Note that in the non-root case, we won't actually delete the old
* tuples, only replace them with redirects or placeholders.)
*
* Note: the SGLTDATUM calls here are safe even when dealing with a nulls
* page. For a pass-by-value data type we will fetch a word that must
* exist even though it may contain garbage (because of the fact that leaf
* tuples must have size at least SGDTSIZE). For a pass-by-reference type
* we are just computing a pointer that isn't going to get dereferenced.
* So it's not worth guarding the calls with isNulls checks.
*/
nToInsert = 0;
nToDelete = 0;
......@@ -757,7 +750,8 @@ doPickSplit(Relation index, SpGistState *state,
PageGetItemId(current->page, i));
if (it->tupstate == SPGIST_LIVE)
{
in.datums[nToInsert] = SGLTDATUM(it, state);
in.datums[nToInsert] =
isNulls ? (Datum) 0 : SGLTDATUM(it, state);
heapPtrs[nToInsert] = it->heapPtr;
nToInsert++;
toDelete[nToDelete] = i;
......@@ -782,7 +776,8 @@ doPickSplit(Relation index, SpGistState *state,
PageGetItemId(current->page, i));
if (it->tupstate == SPGIST_LIVE)
{
in.datums[nToInsert] = SGLTDATUM(it, state);
in.datums[nToInsert] =
isNulls ? (Datum) 0 : SGLTDATUM(it, state);
heapPtrs[nToInsert] = it->heapPtr;
nToInsert++;
toDelete[nToDelete] = i;
......@@ -814,7 +809,8 @@ doPickSplit(Relation index, SpGistState *state,
* space to include it; and in any case it has to be included in the input
* for the picksplit function. So don't increment nToInsert yet.
*/
in.datums[in.nTuples] = SGLTDATUM(newLeafTuple, state);
in.datums[in.nTuples] =
isNulls ? (Datum) 0 : SGLTDATUM(newLeafTuple, state);
heapPtrs[in.nTuples] = newLeafTuple->heapPtr;
in.nTuples++;
......@@ -1940,17 +1936,21 @@ spgdoinsert(Relation index, SpGistState *state,
leafDatum = (Datum) 0;
/*
* Compute space needed for a leaf tuple containing the given datum.
*
* If it isn't gonna fit, and the opclass can't reduce the datum size by
* suffixing, bail out now rather than getting into an endless loop.
* Compute space needed for a leaf tuple containing the given datum. This
* must match spgFormLeafTuple.
*/
leafSize = SGLTHDRSZ;
if (!isnull)
leafSize = SGLTHDRSZ + sizeof(ItemIdData) +
SpGistGetTypeSize(&state->attLeafType, leafDatum);
else
leafSize = SGDTSIZE + sizeof(ItemIdData);
leafSize += SpGistGetLeafTypeSize(&state->attLeafType, leafDatum);
if (leafSize < SGDTSIZE)
leafSize = SGDTSIZE;
/* Account for an item pointer, too */
leafSize += sizeof(ItemIdData);
/*
* If it isn't gonna fit, and the opclass can't reduce the datum size by
* suffixing, bail out now rather than getting into an endless loop.
*/
if (leafSize > SPGIST_PAGE_CAPACITY && !state->config.longValuesOK)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
......@@ -2161,8 +2161,9 @@ spgdoinsert(Relation index, SpGistState *state,
if (!isnull)
{
leafDatum = out.result.matchNode.restDatum;
leafSize = SGLTHDRSZ + sizeof(ItemIdData) +
SpGistGetTypeSize(&state->attLeafType, leafDatum);
leafSize = SGLTHDRSZ +
SpGistGetLeafTypeSize(&state->attLeafType, leafDatum);
leafSize += sizeof(ItemIdData);
}
/*
......
......@@ -603,13 +603,14 @@ spgoptions(Datum reloptions, bool validate)
}
/*
* Get the space needed to store a non-null datum of the indicated type.
* Get the space needed to store a non-null datum of the indicated type
* in an inner tuple (that is, as a prefix or node label).
* Note the result is already rounded up to a MAXALIGN boundary.
* Also, we follow the SPGiST convention that pass-by-val types are
* just stored in their Datum representation (compare memcpyDatum).
* Here we follow the convention that pass-by-val types are just stored
* in their Datum representation (compare memcpyInnerDatum).
*/
unsigned int
SpGistGetTypeSize(SpGistTypeDesc *att, Datum datum)
SpGistGetInnerTypeSize(SpGistTypeDesc *att, Datum datum)
{
unsigned int size;
......@@ -624,10 +625,28 @@ SpGistGetTypeSize(SpGistTypeDesc *att, Datum datum)
}
/*
* Copy the given non-null datum to *target
* Get the space needed to store a non-null datum of the indicated type
* in a leaf tuple. This is just the usual storage space for the type,
* but rounded up to a MAXALIGN boundary.
*/
unsigned int
SpGistGetLeafTypeSize(SpGistTypeDesc *att, Datum datum)
{
unsigned int size;
if (att->attlen > 0)
size = att->attlen;
else
size = VARSIZE_ANY(datum);
return MAXALIGN(size);
}
/*
* Copy the given non-null datum to *target, in the inner-tuple case
*/
static void
memcpyDatum(void *target, SpGistTypeDesc *att, Datum datum)
memcpyInnerDatum(void *target, SpGistTypeDesc *att, Datum datum)
{
unsigned int size;
......@@ -642,6 +661,25 @@ memcpyDatum(void *target, SpGistTypeDesc *att, Datum datum)
}
}
/*
* Copy the given non-null datum to *target, in the leaf-tuple case
*/
static void
memcpyLeafDatum(void *target, SpGistTypeDesc *att, Datum datum)
{
unsigned int size;
if (att->attbyval)
{
store_att_byval(target, datum, att->attlen);
}
else
{
size = (att->attlen > 0) ? att->attlen : VARSIZE_ANY(datum);
memcpy(target, DatumGetPointer(datum), size);
}
}
/*
* Construct a leaf tuple containing the given heap TID and datum value
*/
......@@ -655,7 +693,7 @@ spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr,
/* compute space needed (note result is already maxaligned) */
size = SGLTHDRSZ;
if (!isnull)
size += SpGistGetTypeSize(&state->attLeafType, datum);
size += SpGistGetLeafTypeSize(&state->attLeafType, datum);
/*
* Ensure that we can replace the tuple with a dead tuple later. This
......@@ -671,7 +709,7 @@ spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr,
tup->nextOffset = InvalidOffsetNumber;
tup->heapPtr = *heapPtr;
if (!isnull)
memcpyDatum(SGLTDATAPTR(tup), &state->attLeafType, datum);
memcpyLeafDatum(SGLTDATAPTR(tup), &state->attLeafType, datum);
return tup;
}
......@@ -692,7 +730,7 @@ spgFormNodeTuple(SpGistState *state, Datum label, bool isnull)
/* compute space needed (note result is already maxaligned) */
size = SGNTHDRSZ;
if (!isnull)
size += SpGistGetTypeSize(&state->attLabelType, label);
size += SpGistGetInnerTypeSize(&state->attLabelType, label);
/*
* Here we make sure that the size will fit in the field reserved for it
......@@ -716,7 +754,7 @@ spgFormNodeTuple(SpGistState *state, Datum label, bool isnull)
ItemPointerSetInvalid(&tup->t_tid);
if (!isnull)
memcpyDatum(SGNTDATAPTR(tup), &state->attLabelType, label);
memcpyInnerDatum(SGNTDATAPTR(tup), &state->attLabelType, label);
return tup;
}
......@@ -736,7 +774,7 @@ spgFormInnerTuple(SpGistState *state, bool hasPrefix, Datum prefix,
/* Compute size needed */
if (hasPrefix)
prefixSize = SpGistGetTypeSize(&state->attPrefixType, prefix);
prefixSize = SpGistGetInnerTypeSize(&state->attPrefixType, prefix);
else
prefixSize = 0;
......@@ -781,7 +819,7 @@ spgFormInnerTuple(SpGistState *state, bool hasPrefix, Datum prefix,
tup->size = size;
if (hasPrefix)
memcpyDatum(SGITDATAPTR(tup), &state->attPrefixType, prefix);
memcpyInnerDatum(SGITDATAPTR(tup), &state->attPrefixType, prefix);
ptr = (char *) SGITNODEPTR(tup);
......
......@@ -266,6 +266,14 @@ typedef struct SpGistCache
* header/optional prefix/array of nodes, which are SpGistNodeTuples
*
* size and prefixSize must be multiples of MAXALIGN
*
* If the prefix datum is of a pass-by-value type, it is stored in its
* Datum representation, that is its on-disk representation is of length
* sizeof(Datum). This is a fairly unfortunate choice, because in no other
* place does Postgres use Datum as an on-disk representation; it creates
* an unnecessary incompatibility between 32-bit and 64-bit builds. But the
* compatibility loss is mostly theoretical since MAXIMUM_ALIGNOF typically
* differs between such builds, too. Anyway we're stuck with it now.
*/
typedef struct SpGistInnerTupleData
{
......@@ -307,8 +315,7 @@ typedef SpGistInnerTupleData *SpGistInnerTuple;
* Node tuples use the same header as ordinary Postgres IndexTuples, but
* we do not use a null bitmap, because we know there is only one column
* so the INDEX_NULL_MASK bit suffices. Also, pass-by-value datums are
* stored as a full Datum, the same convention as for inner tuple prefixes
* and leaf tuple datums.
* stored in Datum form, the same convention as for inner tuple prefixes.
*/
typedef IndexTupleData SpGistNodeTupleData;
......@@ -322,12 +329,15 @@ typedef SpGistNodeTupleData *SpGistNodeTuple;
PointerGetDatum(SGNTDATAPTR(x)))
/*
* SPGiST leaf tuple: carries a datum and a heap tuple TID
* SPGiST leaf tuple: carries a leaf datum and a heap tuple TID
*
* In the simplest case, the datum is the same as the indexed value; but
* it could also be a suffix or some other sort of delta that permits
* In the simplest case, the leaf datum is the same as the indexed value;
* but it could also be a suffix or some other sort of delta that permits
* reconstruction given knowledge of the prefix path traversed to get here.
*
* If the leaf datum is NULL, it's not stored. This is not represented
* explicitly; we infer it from the tuple being stored on a nulls page.
*
* The size field is wider than could possibly be needed for an on-disk leaf
* tuple, but this allows us to form leaf tuples even when the datum is too
* wide to be stored immediately, and it costs nothing because of alignment
......@@ -339,13 +349,8 @@ typedef SpGistNodeTupleData *SpGistNodeTuple;
*
* size must be a multiple of MAXALIGN; also, it must be at least SGDTSIZE
* so that the tuple can be converted to REDIRECT status later. (This
* restriction only adds bytes for the null-datum case, otherwise alignment
* restrictions force it anyway.)
*
* In a leaf tuple for a NULL indexed value, there's no useful datum value;
* however, the SGDTSIZE limit ensures that's there's a Datum word there
* anyway, so SGLTDATUM can be applied safely as long as you don't do
* anything with the result.
* restriction only adds bytes for a NULL leaf datum stored on a 32-bit
* machine; otherwise alignment restrictions force it anyway.)
*/
typedef struct SpGistLeafTupleData
{
......@@ -353,16 +358,16 @@ typedef struct SpGistLeafTupleData
size:30; /* large enough for any palloc'able value */
OffsetNumber nextOffset; /* next tuple in chain, or InvalidOffsetNumber */
ItemPointerData heapPtr; /* TID of represented heap tuple */
/* leaf datum follows */
/* leaf datum follows on a MAXALIGN boundary */
} SpGistLeafTupleData;
typedef SpGistLeafTupleData *SpGistLeafTuple;
#define SGLTHDRSZ MAXALIGN(sizeof(SpGistLeafTupleData))
#define SGLTDATAPTR(x) (((char *) (x)) + SGLTHDRSZ)
#define SGLTDATUM(x, s) ((s)->attLeafType.attbyval ? \
*(Datum *) SGLTDATAPTR(x) : \
PointerGetDatum(SGLTDATAPTR(x)))
#define SGLTDATUM(x, s) fetch_att(SGLTDATAPTR(x), \
(s)->attLeafType.attbyval, \
(s)->attLeafType.attlen)
/*
* SPGiST dead tuple: declaration for examining non-live tuples
......@@ -455,7 +460,8 @@ extern void SpGistSetLastUsedPage(Relation index, Buffer buffer);
extern void SpGistInitPage(Page page, uint16 f);
extern void SpGistInitBuffer(Buffer b, uint16 f);
extern void SpGistInitMetapage(Page page);
extern unsigned int SpGistGetTypeSize(SpGistTypeDesc *att, Datum datum);
extern unsigned int SpGistGetInnerTypeSize(SpGistTypeDesc *att, Datum datum);
extern unsigned int SpGistGetLeafTypeSize(SpGistTypeDesc *att, Datum datum);
extern SpGistLeafTuple spgFormLeafTuple(SpGistState *state,
ItemPointer heapPtr,
Datum datum, bool isnull);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment