Commit 46dddf76 authored by Tom Lane

Improve key representation for GIN jsonb_ops, and fix existence-search bug.

Change the key representation so that values that would exceed 127 bytes
are hashed into short strings, and so that the original JSON datatype of
each value is recorded in the index.  The hashing rule eliminates the major
objection to having this opclass be the default for jsonb, namely that it
could fail for plausible input data (due to GIN's restrictions on maximum
key length).  Preserving datatype information doesn't really buy us much
right now, but it requires no extra space compared to the previous way,
and it might be useful later.

Also, change the consistency-checking functions to request recheck for
exists (jsonb ? text) and related operators.  The original analysis that
this is an exactly checkable query was incorrect, since the index does
not preserve information about whether a key appears at top level in
the indexed JSON object.  Add a test case demonstrating the problem.

Make some other, mostly cosmetic improvements to the code in jsonb_gin.c
as well.

catversion bump due to on-disk data format change in jsonb_ops indexes.
parent ff7bbb01
This diff is collapsed.
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201405051
#define CATALOG_VERSION_NO 201405091
#endif
......@@ -29,25 +29,41 @@ typedef enum
WJB_END_OBJECT
} JsonbIteratorToken;
/*
* When using a GIN index for jsonb, we choose to index both keys and values.
* The storage format is text, with K or V prepended to the string to indicate
* key/element or value/element.
*
* Jsonb Keys and string array elements are treated equivalently when
* serialized to text index storage. One day we may wish to create an opclass
* that only indexes values, but for now keys and values are stored in GIN
* indexes in a way that doesn't really consider their relationship to each
* other.
*/
#define JKEYELEM 'K'
#define JVAL 'V'
/* Strategy numbers for GIN index opclasses */
#define JsonbContainsStrategyNumber 7
#define JsonbExistsStrategyNumber 9
#define JsonbExistsAnyStrategyNumber 10
#define JsonbExistsAllStrategyNumber 11
/*
* In the standard jsonb_ops GIN opclass for jsonb, we choose to index both
* keys and values. The storage format is text. The first byte of the text
* string distinguishes whether this is a key (always a string), null value,
* boolean value, numeric value, or string value. However, array elements
* that are strings are marked as though they were keys; this imprecision
* supports the definition of the "exists" operator, which treats array
* elements like keys. The remainder of the text string is empty for a null
* value, "t" or "f" for a boolean value, a normalized print representation of
* a numeric value, or the text of a string value. However, if the length of
* this text representation would exceed JGIN_MAXLENGTH bytes, we instead hash
* the text representation and store an 8-hex-digit representation of the
* uint32 hash value, marking the prefix byte with an additional bit to
* distinguish that this has happened. Hashing long strings saves space and
* ensures that we won't overrun the maximum entry length for a GIN index.
* (But JGIN_MAXLENGTH is quite a bit shorter than GIN's limit. It's chosen
* to ensure that the on-disk text datum will have a short varlena header.)
* Note that when any hashed item appears in a query, we must recheck index
* matches against the heap tuple; currently, this costs nothing because we
* must always recheck for other reasons.
*/
#define JGINFLAG_KEY 0x01 /* key (or string array element) */
#define JGINFLAG_NULL 0x02 /* null value */
#define JGINFLAG_BOOL 0x03 /* boolean value */
#define JGINFLAG_NUM 0x04 /* numeric value */
#define JGINFLAG_STR 0x05 /* string value (if not an array element) */
#define JGINFLAG_HASHED 0x10 /* OR'd into flag if value was hashed */
#define JGIN_MAXLENGTH 125 /* max length of text part before hashing */
/* Convenience macros */
#define DatumGetJsonb(d) ((Jsonb *) PG_DETOAST_DATUM(d))
#define JsonbGetDatum(p) PointerGetDatum(p)
......@@ -332,12 +348,12 @@ extern Datum gin_consistent_jsonb_hash(PG_FUNCTION_ARGS);
extern Datum gin_triconsistent_jsonb_hash(PG_FUNCTION_ARGS);
/* Support functions */
extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b);
extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b);
extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader,
uint32 flags,
JsonbValue *key);
uint32 flags,
JsonbValue *key);
extern JsonbValue *getIthJsonbValueFromContainer(JsonbContainer *sheader,
uint32 i);
uint32 i);
extern JsonbValue *pushJsonbValue(JsonbParseState **pstate,
JsonbIteratorToken seq, JsonbValue *scalarVal);
extern JsonbIterator *JsonbIteratorInit(JsonbContainer *container);
......
......@@ -1006,4 +1006,7 @@
{"wait":null, "line":1000}
{"age":25}
{"age":25.0}
{"foo": {"bar": "baz"}}
{"foo": {"blah": "baz"}}
{"fool": {"bar": "baz"}}
{}
......@@ -1483,6 +1483,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public';
194
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
count
-------
0
(1 row)
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
count
-------
......@@ -1543,7 +1549,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"array":["bar"]}';
SELECT count(*) FROM testjsonb WHERE j @> '{}';
count
-------
1009
1012
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'public';
......@@ -1552,6 +1558,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public';
194
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
count
-------
0
(1 row)
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
count
-------
......@@ -1591,7 +1603,7 @@ RESET enable_seqscan;
SELECT count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow;
count
-------
4788
4791
(1 row)
SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GROUP BY key ORDER BY count DESC, key;
......@@ -1621,20 +1633,22 @@ SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GRO
abstract | 161
array | 5
age | 2
(24 rows)
foo | 2
fool | 1
(26 rows)
-- sort/hash
SELECT count(distinct j) FROM testjsonb;
count
-------
891
894
(1 row)
SET enable_hashagg = off;
SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2;
count
-------
891
894
(1 row)
SET enable_hashagg = on;
......@@ -1642,7 +1656,7 @@ SET enable_sort = off;
SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2;
count
-------
891
894
(1 row)
SELECT distinct * FROM (values (jsonb '{}' || ''),('{}')) v(j);
......@@ -1709,7 +1723,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"age":25.0}';
SELECT count(*) FROM testjsonb WHERE j @> '{}';
count
-------
1009
1012
(1 row)
RESET enable_seqscan;
......
......@@ -1483,6 +1483,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public';
194
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
count
-------
0
(1 row)
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
count
-------
......@@ -1543,7 +1549,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"array":["bar"]}';
SELECT count(*) FROM testjsonb WHERE j @> '{}';
count
-------
1009
1012
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'public';
......@@ -1552,6 +1558,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public';
194
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
count
-------
0
(1 row)
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
count
-------
......@@ -1591,7 +1603,7 @@ RESET enable_seqscan;
SELECT count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow;
count
-------
4788
4791
(1 row)
SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GROUP BY key ORDER BY count DESC, key;
......@@ -1621,20 +1633,22 @@ SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GRO
abstract | 161
array | 5
age | 2
(24 rows)
foo | 2
fool | 1
(26 rows)
-- sort/hash
SELECT count(distinct j) FROM testjsonb;
count
-------
891
894
(1 row)
SET enable_hashagg = off;
SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2;
count
-------
891
894
(1 row)
SET enable_hashagg = on;
......@@ -1642,7 +1656,7 @@ SET enable_sort = off;
SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2;
count
-------
891
894
(1 row)
SELECT distinct * FROM (values (jsonb '{}' || ''),('{}')) v(j);
......@@ -1709,7 +1723,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"age":25.0}';
SELECT count(*) FROM testjsonb WHERE j @> '{}';
count
-------
1009
1012
(1 row)
RESET enable_seqscan;
......
......@@ -334,6 +334,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"wait":"CC", "public":true}';
SELECT count(*) FROM testjsonb WHERE j @> '{"age":25}';
SELECT count(*) FROM testjsonb WHERE j @> '{"age":25.0}';
SELECT count(*) FROM testjsonb WHERE j ? 'public';
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
SELECT count(*) FROM testjsonb WHERE j ?& ARRAY['public','disabled'];
......@@ -350,6 +351,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"array":["bar"]}';
-- exercise GIN_SEARCH_MODE_ALL
SELECT count(*) FROM testjsonb WHERE j @> '{}';
SELECT count(*) FROM testjsonb WHERE j ? 'public';
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
SELECT count(*) FROM testjsonb WHERE j ?& ARRAY['public','disabled'];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment