Commit 7577dd84 authored by Tomas Vondra's avatar Tomas Vondra

Properly detoast data in brin_form_tuple

brin_form_tuple failed to consider the values may be toasted, inserting
the toast pointer into the index. This may easily result in index
corruption, as the toast data may be deleted and cleaned up by vacuum.
The cleanup however does not care about indexes, leaving invalid toast
pointers behind, which triggers errors like this:

  ERROR:  missing chunk number 0 for toast value 16433 in pg_toast_16426

A less severe consequence are inconsistent failures due to the index row
being too large, depending on whether brin_form_tuple operated on plain
or toasted version of the row. For example

    CREATE TABLE t (val TEXT);
    INSERT INTO t VALUES ('... long value ...')
    CREATE INDEX idx ON t USING brin (val);

would likely succeed, as the row would likely include toast pointer.
Switching the order of INSERT and CREATE INDEX would likely fail:

    ERROR:  index row size 8712 exceeds maximum 8152 for index "idx"

because this happens before the row values are toasted.

The bug exists since PostgreSQL 9.5 where BRIN indexes were introduced.
So backpatch all the way back.

Author: Tomas Vondra
Reviewed-by: Alvaro Herrera
Backpatch-through: 9.5
Discussion: https://postgr.es/m/20201001184133.oq5uq75sb45pu3aw@development
Discussion: https://postgr.es/m/20201104010544.zexj52mlldagzowv%40development
parent eeda7f63
...@@ -32,12 +32,23 @@ ...@@ -32,12 +32,23 @@
#include "postgres.h" #include "postgres.h"
#include "access/brin_tuple.h" #include "access/brin_tuple.h"
#include "access/detoast.h"
#include "access/heaptoast.h"
#include "access/htup_details.h" #include "access/htup_details.h"
#include "access/toast_internals.h"
#include "access/tupdesc.h" #include "access/tupdesc.h"
#include "access/tupmacs.h" #include "access/tupmacs.h"
#include "utils/datum.h" #include "utils/datum.h"
#include "utils/memutils.h" #include "utils/memutils.h"
/*
* This enables de-toasting of index entries. Needed until VACUUM is
* smart enough to rebuild indexes from scratch.
*/
#define TOAST_INDEX_HACK
static inline void brin_deconstruct_tuple(BrinDesc *brdesc, static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
char *tp, bits8 *nullbits, bool nulls, char *tp, bits8 *nullbits, bool nulls,
Datum *values, bool *allnulls, bool *hasnulls); Datum *values, bool *allnulls, bool *hasnulls);
...@@ -99,6 +110,12 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, ...@@ -99,6 +110,12 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
Size len, Size len,
hoff, hoff,
data_len; data_len;
int i;
#ifdef TOAST_INDEX_HACK
Datum *untoasted_values;
int nuntoasted = 0;
#endif
Assert(brdesc->bd_totalstored > 0); Assert(brdesc->bd_totalstored > 0);
...@@ -107,6 +124,10 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, ...@@ -107,6 +124,10 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
phony_nullbitmap = (bits8 *) phony_nullbitmap = (bits8 *)
palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored)); palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored));
#ifdef TOAST_INDEX_HACK
untoasted_values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored);
#endif
/* /*
* Set up the values/nulls arrays for heap_fill_tuple * Set up the values/nulls arrays for heap_fill_tuple
*/ */
...@@ -138,10 +159,84 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, ...@@ -138,10 +159,84 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
if (tuple->bt_columns[keyno].bv_hasnulls) if (tuple->bt_columns[keyno].bv_hasnulls)
anynulls = true; anynulls = true;
/*
* Now obtain the values of each stored datum. Note that some values
* might be toasted, and we cannot rely on the original heap values
* sticking around forever, so we must detoast them. Also try to
* compress them.
*/
for (datumno = 0; for (datumno = 0;
datumno < brdesc->bd_info[keyno]->oi_nstored; datumno < brdesc->bd_info[keyno]->oi_nstored;
datumno++) datumno++)
values[idxattno++] = tuple->bt_columns[keyno].bv_values[datumno]; {
Datum value = tuple->bt_columns[keyno].bv_values[datumno];
#ifdef TOAST_INDEX_HACK
/* We must look at the stored type, not at the index descriptor. */
TypeCacheEntry *atttype = brdesc->bd_info[keyno]->oi_typcache[datumno];
/* Do we need to free the value at the end? */
bool free_value = false;
/* For non-varlena types we don't need to do anything special */
if (atttype->typlen != -1)
{
values[idxattno++] = value;
continue;
}
/*
* Do nothing if value is not of varlena type. We don't need to
* care about NULL values here, thanks to bv_allnulls above.
*
* If value is stored EXTERNAL, must fetch it so we are not
* depending on outside storage.
*
* XXX Is this actually true? Could it be that the summary is
* NULL even for range with non-NULL data? E.g. degenerate bloom
* filter may be thrown away, etc.
*/
if (VARATT_IS_EXTERNAL(DatumGetPointer(value)))
{
value = PointerGetDatum(detoast_external_attr((struct varlena *)
DatumGetPointer(value)));
free_value = true;
}
/*
* If value is above size target, and is of a compressible datatype,
* try to compress it in-line.
*/
if (!VARATT_IS_EXTENDED(DatumGetPointer(value)) &&
VARSIZE(DatumGetPointer(value)) > TOAST_INDEX_TARGET &&
(atttype->typstorage == TYPSTORAGE_EXTENDED ||
atttype->typstorage == TYPSTORAGE_MAIN))
{
Datum cvalue = toast_compress_datum(value);
if (DatumGetPointer(cvalue) != NULL)
{
/* successful compression */
if (free_value)
pfree(DatumGetPointer(value));
value = cvalue;
free_value = true;
}
}
/*
* If we untoasted / compressed the value, we need to free it
* after forming the index tuple.
*/
if (free_value)
untoasted_values[nuntoasted++] = value;
#endif
values[idxattno++] = value;
}
} }
/* Assert we did not overrun temp arrays */ /* Assert we did not overrun temp arrays */
...@@ -193,6 +288,11 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, ...@@ -193,6 +288,11 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
pfree(nulls); pfree(nulls);
pfree(phony_nullbitmap); pfree(phony_nullbitmap);
#ifdef TOAST_INDEX_HACK
for (i = 0; i < nuntoasted; i++)
pfree(DatumGetPointer(untoasted_values[i]));
#endif
/* /*
* Now fill in the real null bitmasks. allnulls first. * Now fill in the real null bitmasks. allnulls first.
*/ */
......
...@@ -526,3 +526,44 @@ EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1; ...@@ -526,3 +526,44 @@ EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;
Filter: (b = 1) Filter: (b = 1)
(2 rows) (2 rows)
-- make sure data are properly de-toasted in BRIN index
CREATE TABLE brintest_3 (a text, b text, c text, d text);
-- long random strings (~2000 chars each, so ~6kB for min/max on two
-- columns) to trigger toasting
WITH rand_value AS (SELECT string_agg(md5(i::text),'') AS val FROM generate_series(1,60) s(i))
INSERT INTO brintest_3
SELECT val, val, val, val FROM rand_value;
CREATE INDEX brin_test_toast_idx ON brintest_3 USING brin (b, c);
DELETE FROM brintest_3;
-- We need to wait a bit for all transactions to complete, so that the
-- vacuum actually removes the TOAST rows. Creating an index concurrently
-- is a one way to achieve that, because it does exactly such wait.
CREATE INDEX CONCURRENTLY brin_test_temp_idx ON brintest_3(a);
DROP INDEX brin_test_temp_idx;
-- vacuum the table, to discard TOAST data
VACUUM brintest_3;
-- retry insert with a different random-looking (but deterministic) value
-- the value is different, and so should replace either min or max in the
-- brin summary
WITH rand_value AS (SELECT string_agg(md5((-i)::text),'') AS val FROM generate_series(1,60) s(i))
INSERT INTO brintest_3
SELECT val, val, val, val FROM rand_value;
-- now try some queries, accessing the brin index
SET enable_seqscan = off;
EXPLAIN (COSTS OFF)
SELECT * FROM brintest_3 WHERE b < '0';
QUERY PLAN
------------------------------------------------
Bitmap Heap Scan on brintest_3
Recheck Cond: (b < '0'::text)
-> Bitmap Index Scan on brin_test_toast_idx
Index Cond: (b < '0'::text)
(4 rows)
SELECT * FROM brintest_3 WHERE b < '0';
a | b | c | d
---+---+---+---
(0 rows)
DROP TABLE brintest_3;
RESET enable_seqscan;
...@@ -470,3 +470,42 @@ VACUUM ANALYZE brin_test; ...@@ -470,3 +470,42 @@ VACUUM ANALYZE brin_test;
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1; EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1;
-- Ensure brin index is not used when values are not correlated -- Ensure brin index is not used when values are not correlated
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1; EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;
-- make sure data are properly de-toasted in BRIN index
CREATE TABLE brintest_3 (a text, b text, c text, d text);
-- long random strings (~2000 chars each, so ~6kB for min/max on two
-- columns) to trigger toasting
WITH rand_value AS (SELECT string_agg(md5(i::text),'') AS val FROM generate_series(1,60) s(i))
INSERT INTO brintest_3
SELECT val, val, val, val FROM rand_value;
CREATE INDEX brin_test_toast_idx ON brintest_3 USING brin (b, c);
DELETE FROM brintest_3;
-- We need to wait a bit for all transactions to complete, so that the
-- vacuum actually removes the TOAST rows. Creating an index concurrently
-- is a one way to achieve that, because it does exactly such wait.
CREATE INDEX CONCURRENTLY brin_test_temp_idx ON brintest_3(a);
DROP INDEX brin_test_temp_idx;
-- vacuum the table, to discard TOAST data
VACUUM brintest_3;
-- retry insert with a different random-looking (but deterministic) value
-- the value is different, and so should replace either min or max in the
-- brin summary
WITH rand_value AS (SELECT string_agg(md5((-i)::text),'') AS val FROM generate_series(1,60) s(i))
INSERT INTO brintest_3
SELECT val, val, val, val FROM rand_value;
-- now try some queries, accessing the brin index
SET enable_seqscan = off;
EXPLAIN (COSTS OFF)
SELECT * FROM brintest_3 WHERE b < '0';
SELECT * FROM brintest_3 WHERE b < '0';
DROP TABLE brintest_3;
RESET enable_seqscan;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment