Commit def4c28c authored by Tom Lane's avatar Tom Lane

Change JSONB's on-disk format for improved performance.

The original design used an array of offsets into the variable-length
portion of a JSONB container.  However, such an array is basically
uncompressible by simple compression techniques such as TOAST's LZ
compressor.  That's bad enough, but because the offset array is at the
front, it tended to trigger the give-up-after-1KB heuristic in the TOAST
code, so that the entire JSONB object was stored uncompressed; which was
the root cause of bug #11109 from Larry White.

To fix without losing the ability to extract a random array element in O(1)
time, change this scheme so that most of the JEntry array elements hold
lengths rather than offsets.  With data that's compressible at all, there
tend to be fewer distinct element lengths, so that there is scope for
compression of the JEntry array.  Every N'th entry is still an offset.
To determine the length or offset of any specific element, we might have
to examine up to N preceding JEntrys, but that's still O(1) so far as the
total container size is concerned.  Testing shows that this cost is
negligible compared to other costs of accessing a JSONB field, and that
the method does largely fix the incompressible-data problem.

While at it, rearrange the order of elements in a JSONB object so that
it's "all the keys, then all the values" not alternating keys and values.
This doesn't really make much difference right at the moment, but it will
allow providing a fast path for extracting individual object fields from
large JSONB values stored EXTERNAL (ie, uncompressed), analogously to the
existing optimization for substring extraction from large EXTERNAL text
values.

Bump catversion to denote the incompatibility in on-disk format.
We will need to fix pg_upgrade to disallow upgrading jsonb data stored
with 9.4 betas 1 and 2.

Heikki Linnakangas and Tom Lane
parent ff27fcfa
...@@ -196,12 +196,12 @@ jsonb_from_cstring(char *json, int len) ...@@ -196,12 +196,12 @@ jsonb_from_cstring(char *json, int len)
static size_t static size_t
checkStringLen(size_t len) checkStringLen(size_t len)
{ {
if (len > JENTRY_POSMASK) if (len > JENTRY_OFFLENMASK)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("string too long to represent as jsonb string"), errmsg("string too long to represent as jsonb string"),
errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.", errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
JENTRY_POSMASK))); JENTRY_OFFLENMASK)));
return len; return len;
} }
......
...@@ -26,13 +26,14 @@ ...@@ -26,13 +26,14 @@
* in MaxAllocSize, and the number of elements (or pairs) must fit in the bits * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits
* reserved for that in the JsonbContainer.header field. * reserved for that in the JsonbContainer.header field.
* *
* (the total size of an array's elements is also limited by JENTRY_POSMASK, * (The total size of an array's or object's elements is also limited by
* but we're not concerned about that here) * JENTRY_OFFLENMASK, but we're not concerned about that here.)
*/ */
#define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK)) #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK))
#define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK)) #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK))
static void fillJsonbValue(JEntry *array, int index, char *base_addr, static void fillJsonbValue(JsonbContainer *container, int index,
char *base_addr, uint32 offset,
JsonbValue *result); JsonbValue *result);
static bool equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b); static bool equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b);
static int compareJsonbScalarValue(JsonbValue *a, JsonbValue *b); static int compareJsonbScalarValue(JsonbValue *a, JsonbValue *b);
...@@ -107,6 +108,58 @@ JsonbValueToJsonb(JsonbValue *val) ...@@ -107,6 +108,58 @@ JsonbValueToJsonb(JsonbValue *val)
return out; return out;
} }
/*
* Get the offset of the variable-length portion of a Jsonb node within
* the variable-length-data part of its container. The node is identified
* by index within the container's JEntry array.
*/
uint32
getJsonbOffset(const JsonbContainer *jc, int index)
{
uint32 offset = 0;
int i;
/*
* Start offset of this entry is equal to the end offset of the previous
* entry. Walk backwards to the most recent entry stored as an end
* offset, returning that offset plus any lengths in between.
*/
for (i = index - 1; i >= 0; i--)
{
offset += JBE_OFFLENFLD(jc->children[i]);
if (JBE_HAS_OFF(jc->children[i]))
break;
}
return offset;
}
/*
* Get the length of the variable-length portion of a Jsonb node.
* The node is identified by index within the container's JEntry array.
*/
uint32
getJsonbLength(const JsonbContainer *jc, int index)
{
uint32 off;
uint32 len;
/*
* If the length is stored directly in the JEntry, just return it.
* Otherwise, get the begin offset of the entry, and subtract that from
* the stored end+1 offset.
*/
if (JBE_HAS_OFF(jc->children[index]))
{
off = getJsonbOffset(jc, index);
len = JBE_OFFLENFLD(jc->children[index]) - off;
}
else
len = JBE_OFFLENFLD(jc->children[index]);
return len;
}
/* /*
* BT comparator worker function. Returns an integer less than, equal to, or * BT comparator worker function. Returns an integer less than, equal to, or
* greater than zero, indicating whether a is less than, equal to, or greater * greater than zero, indicating whether a is less than, equal to, or greater
...@@ -272,24 +325,33 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags, ...@@ -272,24 +325,33 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags,
{ {
JEntry *children = container->children; JEntry *children = container->children;
int count = (container->header & JB_CMASK); int count = (container->header & JB_CMASK);
JsonbValue *result = palloc(sizeof(JsonbValue)); JsonbValue *result;
Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0); Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0);
/* Quick out without a palloc cycle if object/array is empty */
if (count <= 0)
return NULL;
result = palloc(sizeof(JsonbValue));
if (flags & JB_FARRAY & container->header) if (flags & JB_FARRAY & container->header)
{ {
char *base_addr = (char *) (children + count); char *base_addr = (char *) (children + count);
uint32 offset = 0;
int i; int i;
for (i = 0; i < count; i++) for (i = 0; i < count; i++)
{ {
fillJsonbValue(children, i, base_addr, result); fillJsonbValue(container, i, base_addr, offset, result);
if (key->type == result->type) if (key->type == result->type)
{ {
if (equalsJsonbScalarValue(key, result)) if (equalsJsonbScalarValue(key, result))
return result; return result;
} }
JBE_ADVANCE_OFFSET(offset, children[i]);
} }
} }
else if (flags & JB_FOBJECT & container->header) else if (flags & JB_FOBJECT & container->header)
...@@ -297,36 +359,35 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags, ...@@ -297,36 +359,35 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags,
/* Since this is an object, account for *Pairs* of Jentrys */ /* Since this is an object, account for *Pairs* of Jentrys */
char *base_addr = (char *) (children + count * 2); char *base_addr = (char *) (children + count * 2);
uint32 stopLow = 0, uint32 stopLow = 0,
stopMiddle; stopHigh = count;
/* Object key past by caller must be a string */ /* Object key passed by caller must be a string */
Assert(key->type == jbvString); Assert(key->type == jbvString);
/* Binary search on object/pair keys *only* */ /* Binary search on object/pair keys *only* */
while (stopLow < count) while (stopLow < stopHigh)
{ {
int index; uint32 stopMiddle;
int difference; int difference;
JsonbValue candidate; JsonbValue candidate;
/* stopMiddle = stopLow + (stopHigh - stopLow) / 2;
* Note how we compensate for the fact that we're iterating
* through pairs (not entries) throughout.
*/
stopMiddle = stopLow + (count - stopLow) / 2;
index = stopMiddle * 2;
candidate.type = jbvString; candidate.type = jbvString;
candidate.val.string.val = base_addr + JBE_OFF(children, index); candidate.val.string.val =
candidate.val.string.len = JBE_LEN(children, index); base_addr + getJsonbOffset(container, stopMiddle);
candidate.val.string.len = getJsonbLength(container, stopMiddle);
difference = lengthCompareJsonbStringValue(&candidate, key); difference = lengthCompareJsonbStringValue(&candidate, key);
if (difference == 0) if (difference == 0)
{ {
/* Found our key, return value */ /* Found our key, return corresponding value */
fillJsonbValue(children, index + 1, base_addr, result); int index = stopMiddle + count;
fillJsonbValue(container, index, base_addr,
getJsonbOffset(container, index),
result);
return result; return result;
} }
...@@ -335,7 +396,7 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags, ...@@ -335,7 +396,7 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags,
if (difference < 0) if (difference < 0)
stopLow = stopMiddle + 1; stopLow = stopMiddle + 1;
else else
count = stopMiddle; stopHigh = stopMiddle;
} }
} }
} }
...@@ -368,7 +429,9 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i) ...@@ -368,7 +429,9 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i)
result = palloc(sizeof(JsonbValue)); result = palloc(sizeof(JsonbValue));
fillJsonbValue(container->children, i, base_addr, result); fillJsonbValue(container, i, base_addr,
getJsonbOffset(container, i),
result);
return result; return result;
} }
...@@ -377,13 +440,20 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i) ...@@ -377,13 +440,20 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i)
* A helper function to fill in a JsonbValue to represent an element of an * A helper function to fill in a JsonbValue to represent an element of an
* array, or a key or value of an object. * array, or a key or value of an object.
* *
* The node's JEntry is at container->children[index], and its variable-length
* data is at base_addr + offset. We make the caller determine the offset
* since in many cases the caller can amortize that work across multiple
* children. When it can't, it can just call getJsonbOffset().
*
* A nested array or object will be returned as jbvBinary, ie. it won't be * A nested array or object will be returned as jbvBinary, ie. it won't be
* expanded. * expanded.
*/ */
static void static void
fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result) fillJsonbValue(JsonbContainer *container, int index,
char *base_addr, uint32 offset,
JsonbValue *result)
{ {
JEntry entry = children[index]; JEntry entry = container->children[index];
if (JBE_ISNULL(entry)) if (JBE_ISNULL(entry))
{ {
...@@ -392,14 +462,14 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result) ...@@ -392,14 +462,14 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
else if (JBE_ISSTRING(entry)) else if (JBE_ISSTRING(entry))
{ {
result->type = jbvString; result->type = jbvString;
result->val.string.val = base_addr + JBE_OFF(children, index); result->val.string.val = base_addr + offset;
result->val.string.len = JBE_LEN(children, index); result->val.string.len = getJsonbLength(container, index);
Assert(result->val.string.len >= 0); Assert(result->val.string.len >= 0);
} }
else if (JBE_ISNUMERIC(entry)) else if (JBE_ISNUMERIC(entry))
{ {
result->type = jbvNumeric; result->type = jbvNumeric;
result->val.numeric = (Numeric) (base_addr + INTALIGN(JBE_OFF(children, index))); result->val.numeric = (Numeric) (base_addr + INTALIGN(offset));
} }
else if (JBE_ISBOOL_TRUE(entry)) else if (JBE_ISBOOL_TRUE(entry))
{ {
...@@ -415,8 +485,10 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result) ...@@ -415,8 +485,10 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result)
{ {
Assert(JBE_ISCONTAINER(entry)); Assert(JBE_ISCONTAINER(entry));
result->type = jbvBinary; result->type = jbvBinary;
result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(JBE_OFF(children, index))); /* Remove alignment padding from data pointer and length */
result->val.binary.len = JBE_LEN(children, index) - (INTALIGN(JBE_OFF(children, index)) - JBE_OFF(children, index)); result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(offset));
result->val.binary.len = getJsonbLength(container, index) -
(INTALIGN(offset) - offset);
} }
} }
...@@ -668,13 +740,15 @@ recurse: ...@@ -668,13 +740,15 @@ recurse:
* a full conversion * a full conversion
*/ */
val->val.array.rawScalar = (*it)->isScalar; val->val.array.rawScalar = (*it)->isScalar;
(*it)->i = 0; (*it)->curIndex = 0;
(*it)->curDataOffset = 0;
(*it)->curValueOffset = 0; /* not actually used */
/* Set state for next call */ /* Set state for next call */
(*it)->state = JBI_ARRAY_ELEM; (*it)->state = JBI_ARRAY_ELEM;
return WJB_BEGIN_ARRAY; return WJB_BEGIN_ARRAY;
case JBI_ARRAY_ELEM: case JBI_ARRAY_ELEM:
if ((*it)->i >= (*it)->nElems) if ((*it)->curIndex >= (*it)->nElems)
{ {
/* /*
* All elements within array already processed. Report this * All elements within array already processed. Report this
...@@ -686,7 +760,13 @@ recurse: ...@@ -686,7 +760,13 @@ recurse:
return WJB_END_ARRAY; return WJB_END_ARRAY;
} }
fillJsonbValue((*it)->children, (*it)->i++, (*it)->dataProper, val); fillJsonbValue((*it)->container, (*it)->curIndex,
(*it)->dataProper, (*it)->curDataOffset,
val);
JBE_ADVANCE_OFFSET((*it)->curDataOffset,
(*it)->children[(*it)->curIndex]);
(*it)->curIndex++;
if (!IsAJsonbScalar(val) && !skipNested) if (!IsAJsonbScalar(val) && !skipNested)
{ {
...@@ -697,8 +777,8 @@ recurse: ...@@ -697,8 +777,8 @@ recurse:
else else
{ {
/* /*
* Scalar item in array, or a container and caller didn't * Scalar item in array, or a container and caller didn't want
* want us to recurse into it. * us to recurse into it.
*/ */
return WJB_ELEM; return WJB_ELEM;
} }
...@@ -712,13 +792,16 @@ recurse: ...@@ -712,13 +792,16 @@ recurse:
* v->val.object.pairs is not actually set, because we aren't * v->val.object.pairs is not actually set, because we aren't
* doing a full conversion * doing a full conversion
*/ */
(*it)->i = 0; (*it)->curIndex = 0;
(*it)->curDataOffset = 0;
(*it)->curValueOffset = getJsonbOffset((*it)->container,
(*it)->nElems);
/* Set state for next call */ /* Set state for next call */
(*it)->state = JBI_OBJECT_KEY; (*it)->state = JBI_OBJECT_KEY;
return WJB_BEGIN_OBJECT; return WJB_BEGIN_OBJECT;
case JBI_OBJECT_KEY: case JBI_OBJECT_KEY:
if ((*it)->i >= (*it)->nElems) if ((*it)->curIndex >= (*it)->nElems)
{ {
/* /*
* All pairs within object already processed. Report this to * All pairs within object already processed. Report this to
...@@ -732,7 +815,9 @@ recurse: ...@@ -732,7 +815,9 @@ recurse:
else else
{ {
/* Return key of a key/value pair. */ /* Return key of a key/value pair. */
fillJsonbValue((*it)->children, (*it)->i * 2, (*it)->dataProper, val); fillJsonbValue((*it)->container, (*it)->curIndex,
(*it)->dataProper, (*it)->curDataOffset,
val);
if (val->type != jbvString) if (val->type != jbvString)
elog(ERROR, "unexpected jsonb type as object key"); elog(ERROR, "unexpected jsonb type as object key");
...@@ -745,8 +830,15 @@ recurse: ...@@ -745,8 +830,15 @@ recurse:
/* Set state for next call */ /* Set state for next call */
(*it)->state = JBI_OBJECT_KEY; (*it)->state = JBI_OBJECT_KEY;
fillJsonbValue((*it)->children, ((*it)->i++) * 2 + 1, fillJsonbValue((*it)->container, (*it)->curIndex + (*it)->nElems,
(*it)->dataProper, val); (*it)->dataProper, (*it)->curValueOffset,
val);
JBE_ADVANCE_OFFSET((*it)->curDataOffset,
(*it)->children[(*it)->curIndex]);
JBE_ADVANCE_OFFSET((*it)->curValueOffset,
(*it)->children[(*it)->curIndex + (*it)->nElems]);
(*it)->curIndex++;
/* /*
* Value may be a container, in which case we recurse with new, * Value may be a container, in which case we recurse with new,
...@@ -795,11 +887,6 @@ iteratorFromContainer(JsonbContainer *container, JsonbIterator *parent) ...@@ -795,11 +887,6 @@ iteratorFromContainer(JsonbContainer *container, JsonbIterator *parent)
break; break;
case JB_FOBJECT: case JB_FOBJECT:
/*
* Offset reflects that nElems indicates JsonbPairs in an object.
* Each key and each value contain Jentry metadata just the same.
*/
it->dataProper = it->dataProper =
(char *) it->children + it->nElems * sizeof(JEntry) * 2; (char *) it->children + it->nElems * sizeof(JEntry) * 2;
it->state = JBI_OBJECT_START; it->state = JBI_OBJECT_START;
...@@ -1209,8 +1296,8 @@ reserveFromBuffer(StringInfo buffer, int len) ...@@ -1209,8 +1296,8 @@ reserveFromBuffer(StringInfo buffer, int len)
buffer->len += len; buffer->len += len;
/* /*
* Keep a trailing null in place, even though it's not useful for us; * Keep a trailing null in place, even though it's not useful for us; it
* it seems best to preserve the invariants of StringInfos. * seems best to preserve the invariants of StringInfos.
*/ */
buffer->data[buffer->len] = '\0'; buffer->data[buffer->len] = '\0';
...@@ -1284,8 +1371,8 @@ convertToJsonb(JsonbValue *val) ...@@ -1284,8 +1371,8 @@ convertToJsonb(JsonbValue *val)
/* /*
* Note: the JEntry of the root is discarded. Therefore the root * Note: the JEntry of the root is discarded. Therefore the root
* JsonbContainer struct must contain enough information to tell what * JsonbContainer struct must contain enough information to tell what kind
* kind of value it is. * of value it is.
*/ */
res = (Jsonb *) buffer.data; res = (Jsonb *) buffer.data;
...@@ -1299,9 +1386,9 @@ convertToJsonb(JsonbValue *val) ...@@ -1299,9 +1386,9 @@ convertToJsonb(JsonbValue *val)
* Subroutine of convertJsonb: serialize a single JsonbValue into buffer. * Subroutine of convertJsonb: serialize a single JsonbValue into buffer.
* *
* The JEntry header for this node is returned in *header. It is filled in * The JEntry header for this node is returned in *header. It is filled in
* with the length of this value, but if it is stored in an array or an * with the length of this value and appropriate type bits. If we wish to
* object (which is always, except for the root node), it is the caller's * store an end offset rather than a length, it is the caller's responsibility
* responsibility to adjust it with the offset within the container. * to adjust for that.
* *
* If the value is an array or an object, this recurses. 'level' is only used * If the value is an array or an object, this recurses. 'level' is only used
* for debugging purposes. * for debugging purposes.
...@@ -1315,10 +1402,10 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level) ...@@ -1315,10 +1402,10 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level)
return; return;
/* /*
* A JsonbValue passed as val should never have a type of jbvBinary, * A JsonbValue passed as val should never have a type of jbvBinary, and
* and neither should any of its sub-components. Those values will be * neither should any of its sub-components. Those values will be produced
* produced by convertJsonbArray and convertJsonbObject, the results of * by convertJsonbArray and convertJsonbObject, the results of which will
* which will not be passed back to this function as an argument. * not be passed back to this function as an argument.
*/ */
if (IsAJsonbScalar(val)) if (IsAJsonbScalar(val))
...@@ -1334,124 +1421,200 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level) ...@@ -1334,124 +1421,200 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level)
static void static void
convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level) convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
{ {
int offset; int base_offset;
int metaoffset; int jentry_offset;
int i; int i;
int totallen; int totallen;
uint32 header; uint32 header;
int nElems = val->val.array.nElems;
/* Initialize pointer into conversion buffer at this level */ /* Remember where in the buffer this array starts. */
offset = buffer->len; base_offset = buffer->len;
/* Align to 4-byte boundary (any padding counts as part of my data) */
padBufferToInt(buffer); padBufferToInt(buffer);
/* /*
* Construct the header Jentry, stored in the beginning of the variable- * Construct the header Jentry and store it in the beginning of the
* length payload. * variable-length payload.
*/ */
header = val->val.array.nElems | JB_FARRAY; header = nElems | JB_FARRAY;
if (val->val.array.rawScalar) if (val->val.array.rawScalar)
{ {
Assert(val->val.array.nElems == 1); Assert(nElems == 1);
Assert(level == 0); Assert(level == 0);
header |= JB_FSCALAR; header |= JB_FSCALAR;
} }
appendToBuffer(buffer, (char *) &header, sizeof(uint32)); appendToBuffer(buffer, (char *) &header, sizeof(uint32));
/* reserve space for the JEntries of the elements. */
metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.array.nElems); /* Reserve space for the JEntries of the elements. */
jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nElems);
totallen = 0; totallen = 0;
for (i = 0; i < val->val.array.nElems; i++) for (i = 0; i < nElems; i++)
{ {
JsonbValue *elem = &val->val.array.elems[i]; JsonbValue *elem = &val->val.array.elems[i];
int len; int len;
JEntry meta; JEntry meta;
/*
* Convert element, producing a JEntry and appending its
* variable-length data to buffer
*/
convertJsonbValue(buffer, &meta, elem, level + 1); convertJsonbValue(buffer, &meta, elem, level + 1);
len = meta & JENTRY_POSMASK;
len = JBE_OFFLENFLD(meta);
totallen += len; totallen += len;
if (totallen > JENTRY_POSMASK) /*
* Bail out if total variable-length data exceeds what will fit in a
* JEntry length field. We check this in each iteration, not just
* once at the end, to forestall possible integer overflow.
*/
if (totallen > JENTRY_OFFLENMASK)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("total size of jsonb array elements exceeds the maximum of %u bytes", errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
JENTRY_POSMASK))); JENTRY_OFFLENMASK)));
/*
* Convert each JB_OFFSET_STRIDE'th length to an offset.
*/
if ((i % JB_OFFSET_STRIDE) == 0)
meta = (meta & JENTRY_TYPEMASK) | totallen | JENTRY_HAS_OFF;
if (i > 0) copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
meta = (meta & ~JENTRY_POSMASK) | totallen; jentry_offset += sizeof(JEntry);
copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry));
metaoffset += sizeof(JEntry);
} }
totallen = buffer->len - offset; /* Total data size is everything we've appended to buffer */
totallen = buffer->len - base_offset;
/* Check length again, since we didn't include the metadata above */
if (totallen > JENTRY_OFFLENMASK)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("total size of jsonb array elements exceeds the maximum of %u bytes",
JENTRY_OFFLENMASK)));
/* Initialize the header of this node, in the container's JEntry array */ /* Initialize the header of this node in the container's JEntry array */
*pheader = JENTRY_ISCONTAINER | totallen; *pheader = JENTRY_ISCONTAINER | totallen;
} }
static void static void
convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level) convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level)
{ {
uint32 header; int base_offset;
int offset; int jentry_offset;
int metaoffset;
int i; int i;
int totallen; int totallen;
uint32 header;
int nPairs = val->val.object.nPairs;
/* Initialize pointer into conversion buffer at this level */ /* Remember where in the buffer this object starts. */
offset = buffer->len; base_offset = buffer->len;
/* Align to 4-byte boundary (any padding counts as part of my data) */
padBufferToInt(buffer); padBufferToInt(buffer);
/* Initialize header */ /*
header = val->val.object.nPairs | JB_FOBJECT; * Construct the header Jentry and store it in the beginning of the
* variable-length payload.
*/
header = nPairs | JB_FOBJECT;
appendToBuffer(buffer, (char *) &header, sizeof(uint32)); appendToBuffer(buffer, (char *) &header, sizeof(uint32));
/* reserve space for the JEntries of the keys and values */ /* Reserve space for the JEntries of the keys and values. */
metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2); jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nPairs * 2);
/*
* Iterate over the keys, then over the values, since that is the ordering
* we want in the on-disk representation.
*/
totallen = 0; totallen = 0;
for (i = 0; i < val->val.object.nPairs; i++) for (i = 0; i < nPairs; i++)
{ {
JsonbPair *pair = &val->val.object.pairs[i]; JsonbPair *pair = &val->val.object.pairs[i];
int len; int len;
JEntry meta; JEntry meta;
/* put key */ /*
* Convert key, producing a JEntry and appending its variable-length
* data to buffer
*/
convertJsonbScalar(buffer, &meta, &pair->key); convertJsonbScalar(buffer, &meta, &pair->key);
len = meta & JENTRY_POSMASK; len = JBE_OFFLENFLD(meta);
totallen += len; totallen += len;
if (totallen > JENTRY_POSMASK) /*
* Bail out if total variable-length data exceeds what will fit in a
* JEntry length field. We check this in each iteration, not just
* once at the end, to forestall possible integer overflow.
*/
if (totallen > JENTRY_OFFLENMASK)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("total size of jsonb array elements exceeds the maximum of %u bytes", errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
JENTRY_POSMASK))); JENTRY_OFFLENMASK)));
/*
* Convert each JB_OFFSET_STRIDE'th length to an offset.
*/
if ((i % JB_OFFSET_STRIDE) == 0)
meta = (meta & JENTRY_TYPEMASK) | totallen | JENTRY_HAS_OFF;
copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
jentry_offset += sizeof(JEntry);
}
for (i = 0; i < nPairs; i++)
{
JsonbPair *pair = &val->val.object.pairs[i];
int len;
JEntry meta;
if (i > 0) /*
meta = (meta & ~JENTRY_POSMASK) | totallen; * Convert value, producing a JEntry and appending its variable-length
copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry)); * data to buffer
metaoffset += sizeof(JEntry); */
convertJsonbValue(buffer, &meta, &pair->value, level + 1);
convertJsonbValue(buffer, &meta, &pair->value, level); len = JBE_OFFLENFLD(meta);
len = meta & JENTRY_POSMASK;
totallen += len; totallen += len;
if (totallen > JENTRY_POSMASK) /*
* Bail out if total variable-length data exceeds what will fit in a
* JEntry length field. We check this in each iteration, not just
* once at the end, to forestall possible integer overflow.
*/
if (totallen > JENTRY_OFFLENMASK)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("total size of jsonb array elements exceeds the maximum of %u bytes", errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
JENTRY_POSMASK))); JENTRY_OFFLENMASK)));
meta = (meta & ~JENTRY_POSMASK) | totallen; /*
copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry)); * Convert each JB_OFFSET_STRIDE'th length to an offset.
metaoffset += sizeof(JEntry); */
if (((i + nPairs) % JB_OFFSET_STRIDE) == 0)
meta = (meta & JENTRY_TYPEMASK) | totallen | JENTRY_HAS_OFF;
copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry));
jentry_offset += sizeof(JEntry);
} }
totallen = buffer->len - offset; /* Total data size is everything we've appended to buffer */
totallen = buffer->len - base_offset;
/* Check length again, since we didn't include the metadata above */
if (totallen > JENTRY_OFFLENMASK)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("total size of jsonb object elements exceeds the maximum of %u bytes",
JENTRY_OFFLENMASK)));
/* Initialize the header of this node in the container's JEntry array */
*pheader = JENTRY_ISCONTAINER | totallen; *pheader = JENTRY_ISCONTAINER | totallen;
} }
......
...@@ -53,6 +53,6 @@ ...@@ -53,6 +53,6 @@
*/ */
/* yyyymmddN */ /* yyyymmddN */
#define CATALOG_VERSION_NO 201409241 #define CATALOG_VERSION_NO 201409292
#endif #endif
...@@ -83,9 +83,9 @@ typedef struct JsonbValue JsonbValue; ...@@ -83,9 +83,9 @@ typedef struct JsonbValue JsonbValue;
* buffer is accessed, but they can also be deep copied and passed around. * buffer is accessed, but they can also be deep copied and passed around.
* *
* Jsonb is a tree structure. Each node in the tree consists of a JEntry * Jsonb is a tree structure. Each node in the tree consists of a JEntry
* header, and a variable-length content. The JEntry header indicates what * header and a variable-length content (possibly of zero size). The JEntry
* kind of a node it is, e.g. a string or an array, and the offset and length * header indicates what kind of a node it is, e.g. a string or an array,
* of its variable-length portion within the container. * and provides the length of its variable-length portion.
* *
* The JEntry and the content of a node are not stored physically together. * The JEntry and the content of a node are not stored physically together.
* Instead, the container array or object has an array that holds the JEntrys * Instead, the container array or object has an array that holds the JEntrys
...@@ -95,40 +95,52 @@ typedef struct JsonbValue JsonbValue; ...@@ -95,40 +95,52 @@ typedef struct JsonbValue JsonbValue;
* hold its JEntry. Hence, no JEntry header is stored for the root node. It * hold its JEntry. Hence, no JEntry header is stored for the root node. It
* is implicitly known that the root node must be an array or an object, * is implicitly known that the root node must be an array or an object,
* so we can get away without the type indicator as long as we can distinguish * so we can get away without the type indicator as long as we can distinguish
* the two. For that purpose, both an array and an object begins with a uint32 * the two. For that purpose, both an array and an object begin with a uint32
* header field, which contains an JB_FOBJECT or JB_FARRAY flag. When a naked * header field, which contains an JB_FOBJECT or JB_FARRAY flag. When a naked
* scalar value needs to be stored as a Jsonb value, what we actually store is * scalar value needs to be stored as a Jsonb value, what we actually store is
* an array with one element, with the flags in the array's header field set * an array with one element, with the flags in the array's header field set
* to JB_FSCALAR | JB_FARRAY. * to JB_FSCALAR | JB_FARRAY.
* *
* To encode the length and offset of the variable-length portion of each
* node in a compact way, the JEntry stores only the end offset within the
* variable-length portion of the container node. For the first JEntry in the
* container's JEntry array, that equals to the length of the node data. The
* begin offset and length of the rest of the entries can be calculated using
* the end offset of the previous JEntry in the array.
*
* Overall, the Jsonb struct requires 4-bytes alignment. Within the struct, * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
* the variable-length portion of some node types is aligned to a 4-byte * the variable-length portion of some node types is aligned to a 4-byte
* boundary, while others are not. When alignment is needed, the padding is * boundary, while others are not. When alignment is needed, the padding is
* in the beginning of the node that requires it. For example, if a numeric * in the beginning of the node that requires it. For example, if a numeric
* node is stored after a string node, so that the numeric node begins at * node is stored after a string node, so that the numeric node begins at
* offset 3, the variable-length portion of the numeric node will begin with * offset 3, the variable-length portion of the numeric node will begin with
* one padding byte. * one padding byte so that the actual numeric data is 4-byte aligned.
*/ */
/* /*
* Jentry format. * JEntry format.
*
* The least significant 28 bits store either the data length of the entry,
* or its end+1 offset from the start of the variable-length portion of the
* containing object. The next three bits store the type of the entry, and
* the high-order bit tells whether the least significant bits store a length
* or an offset.
*
* The reason for the offset-or-length complication is to compromise between
* access speed and data compressibility. In the initial design each JEntry
* always stored an offset, but this resulted in JEntry arrays with horrible
* compressibility properties, so that TOAST compression of a JSONB did not
* work well. Storing only lengths would greatly improve compressibility,
* but it makes random access into large arrays expensive (O(N) not O(1)).
* So what we do is store an offset in every JB_OFFSET_STRIDE'th JEntry and
* a length in the rest. This results in reasonably compressible data (as
* long as the stride isn't too small). We may have to examine as many as
* JB_OFFSET_STRIDE JEntrys in order to find out the offset or length of any
* given item, but that's still O(1) no matter how large the container is.
* *
* The least significant 28 bits store the end offset of the entry (see * We could avoid eating a flag bit for this purpose if we were to store
* JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits * the stride in the container header, or if we were willing to treat the
* are used to store the type of the entry. The most significant bit * stride as an unchangeable constant. Neither of those options is very
* is unused, and should be set to zero. * attractive though.
*/ */
typedef uint32 JEntry; typedef uint32 JEntry;
#define JENTRY_POSMASK 0x0FFFFFFF #define JENTRY_OFFLENMASK 0x0FFFFFFF
#define JENTRY_TYPEMASK 0x70000000 #define JENTRY_TYPEMASK 0x70000000
#define JENTRY_HAS_OFF 0x80000000
/* values stored in the type bits */ /* values stored in the type bits */
#define JENTRY_ISSTRING 0x00000000 #define JENTRY_ISSTRING 0x00000000
...@@ -138,7 +150,9 @@ typedef uint32 JEntry; ...@@ -138,7 +150,9 @@ typedef uint32 JEntry;
#define JENTRY_ISNULL 0x40000000 #define JENTRY_ISNULL 0x40000000
#define JENTRY_ISCONTAINER 0x50000000 /* array or object */ #define JENTRY_ISCONTAINER 0x50000000 /* array or object */
/* Note possible multiple evaluations */ /* Access macros. Note possible multiple evaluations */
#define JBE_OFFLENFLD(je_) ((je_) & JENTRY_OFFLENMASK)
#define JBE_HAS_OFF(je_) (((je_) & JENTRY_HAS_OFF) != 0)
#define JBE_ISSTRING(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING) #define JBE_ISSTRING(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING)
#define JBE_ISNUMERIC(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC) #define JBE_ISNUMERIC(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC)
#define JBE_ISCONTAINER(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER) #define JBE_ISCONTAINER(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER)
...@@ -147,20 +161,34 @@ typedef uint32 JEntry; ...@@ -147,20 +161,34 @@ typedef uint32 JEntry;
#define JBE_ISBOOL_FALSE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE) #define JBE_ISBOOL_FALSE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE)
#define JBE_ISBOOL(je_) (JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_)) #define JBE_ISBOOL(je_) (JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
/* Macro for advancing an offset variable to the next JEntry */
#define JBE_ADVANCE_OFFSET(offset, je) \
do { \
JEntry je_ = (je); \
if (JBE_HAS_OFF(je_)) \
(offset) = JBE_OFFLENFLD(je_); \
else \
(offset) += JBE_OFFLENFLD(je_); \
} while(0)
/* /*
* Macros for getting the offset and length of an element. Note multiple * We store an offset, not a length, every JB_OFFSET_STRIDE children.
* evaluations and access to prior array element. * Caution: this macro should only be referenced when creating a JSONB
* value. When examining an existing value, pay attention to the HAS_OFF
* bits instead. This allows changes in the offset-placement heuristic
* without breaking on-disk compatibility.
*/ */
#define JBE_ENDPOS(je_) ((je_) & JENTRY_POSMASK) #define JB_OFFSET_STRIDE 32
#define JBE_OFF(ja, i) ((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1]))
#define JBE_LEN(ja, i) ((i) == 0 ? JBE_ENDPOS((ja)[i]) \
: JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1]))
/* /*
* A jsonb array or object node, within a Jsonb Datum. * A jsonb array or object node, within a Jsonb Datum.
* *
* An array has one child for each element. An object has two children for * An array has one child for each element, stored in array order.
* each key/value pair. *
* An object has two children for each key/value pair. The keys all appear
* first, in key sort order; then the values appear, in an order matching the
* key order. This arrangement keeps the keys compact in memory, making a
* search for a particular key more cache-friendly.
*/ */
typedef struct JsonbContainer typedef struct JsonbContainer
{ {
...@@ -172,8 +200,8 @@ typedef struct JsonbContainer ...@@ -172,8 +200,8 @@ typedef struct JsonbContainer
} JsonbContainer; } JsonbContainer;
/* flags for the header-field in JsonbContainer */ /* flags for the header-field in JsonbContainer */
#define JB_CMASK 0x0FFFFFFF #define JB_CMASK 0x0FFFFFFF /* mask for count field */
#define JB_FSCALAR 0x10000000 #define JB_FSCALAR 0x10000000 /* flag bits */
#define JB_FOBJECT 0x20000000 #define JB_FOBJECT 0x20000000
#define JB_FARRAY 0x40000000 #define JB_FARRAY 0x40000000
...@@ -248,18 +276,20 @@ struct JsonbValue ...@@ -248,18 +276,20 @@ struct JsonbValue
(jsonbval)->type <= jbvBool) (jsonbval)->type <= jbvBool)
/* /*
* Pair within an Object. * Key/value pair within an Object.
* *
* Pairs with duplicate keys are de-duplicated. We store the order for the * This struct type is only used briefly while constructing a Jsonb; it is
* benefit of doing so in a well-defined way with respect to the original * *not* the on-disk representation.
* observed order (which is "last observed wins"). This is only used briefly *
* when originally constructing a Jsonb. * Pairs with duplicate keys are de-duplicated. We store the originally
* observed pair ordering for the purpose of removing duplicates in a
* well-defined way (which is "last observed wins").
*/ */
struct JsonbPair struct JsonbPair
{ {
JsonbValue key; /* Must be a jbvString */ JsonbValue key; /* Must be a jbvString */
JsonbValue value; /* May be of any type */ JsonbValue value; /* May be of any type */
uint32 order; /* preserves order of pairs with equal keys */ uint32 order; /* Pair's index in original sequence */
}; };
/* Conversion state used when parsing Jsonb from text, or for type coercion */ /* Conversion state used when parsing Jsonb from text, or for type coercion */
...@@ -287,20 +317,25 @@ typedef struct JsonbIterator ...@@ -287,20 +317,25 @@ typedef struct JsonbIterator
{ {
/* Container being iterated */ /* Container being iterated */
JsonbContainer *container; JsonbContainer *container;
uint32 nElems; /* Number of elements in children array (will be uint32 nElems; /* Number of elements in children array (will
* nPairs for objects) */ * be nPairs for objects) */
bool isScalar; /* Pseudo-array scalar value? */ bool isScalar; /* Pseudo-array scalar value? */
JEntry *children; JEntry *children; /* JEntrys for child nodes */
/* Data proper. This points to the beginning of the variable-length data */
char *dataProper;
/* Current item in buffer (up to nElems, but must * 2 for objects) */ /* Current item in buffer (up to nElems) */
int i; int curIndex;
/* Data offset corresponding to current item */
uint32 curDataOffset;
/* /*
* Data proper. This points just past end of children array. * If the container is an object, we want to return keys and values
* We use the JBE_OFF() macro on the Jentrys to find offsets of each * alternately; so curDataOffset points to the current key, and
* child in this area. * curValueOffset points to the current value.
*/ */
char *dataProper; uint32 curValueOffset;
/* Private state */ /* Private state */
JsonbIterState state; JsonbIterState state;
...@@ -344,6 +379,8 @@ extern Datum gin_consistent_jsonb_path(PG_FUNCTION_ARGS); ...@@ -344,6 +379,8 @@ extern Datum gin_consistent_jsonb_path(PG_FUNCTION_ARGS);
extern Datum gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS); extern Datum gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS);
/* Support functions */ /* Support functions */
extern uint32 getJsonbOffset(const JsonbContainer *jc, int index);
extern uint32 getJsonbLength(const JsonbContainer *jc, int index);
extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b); extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b);
extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader, extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader,
uint32 flags, uint32 flags,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment