Commit 14d02f0b authored by Heikki Linnakangas's avatar Heikki Linnakangas

Rewrite the way GIN posting lists are packed on a page, to reduce WAL volume.

Inserting (in retail) into the new 9.4 format GIN posting tree created much
larger WAL records than in 9.3. The previous strategy to WAL logging was
basically to log the whole page on each change, with the exception of
completely unmodified segments up to the first modified one. That was not
too bad when appending to the end of the page, as only the last segment had
to be WAL-logged, but per Fujii Masao's testing, even that produced 2x the
WAL volume that 9.3 did.

The new strategy is to keep track of changes to the posting lists in a more
fine-grained fashion, and also make the "repacking" code smarter to avoid
decoding and re-encoding segments unnecessarily.
parent 0cfa34c2
This diff is collapsed.
......@@ -298,9 +298,10 @@ ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_
}
/* copy the first item */
Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
result[ndecoded] = segment->first;
ndecoded++;
Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
val = itemptr_to_uint64(&segment->first);
ptr = segment->bytes;
......
......@@ -145,15 +145,158 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda
/*
 * Replay a leaf-page recompression record against 'page'.
 *
 * The WAL payload is a ginxlogRecompressDataLeaf header immediately followed
 * by data->nactions actions.  Every action begins with two bytes, the number
 * of the segment it applies to and the action type, followed by
 * action-specific data:
 *
 *   GIN_SEGMENT_INSERT / GIN_SEGMENT_REPLACE: a complete GinPostingList,
 *       padded to a SHORTALIGN boundary
 *   GIN_SEGMENT_ADDITEMS: a uint16 item count followed by that many
 *       ItemPointerData
 *   GIN_SEGMENT_DELETE: nothing
 *
 * Actions must arrive in non-decreasing segment order.  Segment numbers count
 * the segments that were on the page before replay started: the counter is
 * only advanced when an existing segment is skipped, deleted or replaced,
 * never when a new one is inserted.
 *
 * Only the action list is applied here.  The earlier whole-tail copy (which
 * memcpy'd data->newdata onto the page and marked the page compressed before
 * anything else ran) is intentionally gone: it made the pre-9.4 conversion
 * below unreachable and clobbered the posting list the actions are meant to
 * patch.
 */
static void
ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data)
{
    int             actionno;
    int             segno;
    GinPostingList *oldseg;
    Pointer         segmentend;
    char           *walbuf;
    int             totalsize;

    /*
     * If the page is in pre-9.4 format, convert to new format first.  This
     * must run before any action is applied, because the actions address
     * compressed segments.
     */
    if (!GinPageIsCompressed(page))
    {
        ItemPointer     uncompressed = (ItemPointer) GinDataPageGetData(page);
        int             nuncompressed = GinPageGetOpaque(page)->maxoff;
        int             npacked;
        GinPostingList *plist;

        plist = ginCompressPostingList(uncompressed, nuncompressed,
                                       BLCKSZ, &npacked);
        Assert(npacked == nuncompressed);

        totalsize = SizeOfGinPostingList(plist);

        memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize);
        GinDataLeafPageSetPostingListSize(page, totalsize);
        GinPageSetCompressed(page);
        GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
    }

    /* Start at the first segment; segmentend tracks the end of used space */
    oldseg = GinDataLeafPageGetPostingList(page);
    segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page);
    segno = 0;

    /* The action list starts right after the fixed-size header */
    walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf);
    for (actionno = 0; actionno < data->nactions; actionno++)
    {
        uint8           a_segno = *((uint8 *) (walbuf++));
        uint8           a_action = *((uint8 *) (walbuf++));
        GinPostingList *newseg = NULL;
        int             newsegsize = 0;
        ItemPointerData *items = NULL;
        uint16          nitems = 0;
        int             segsize;
        Pointer         segptr;
        int             szleft;

        /* Extract all the information we need from the WAL record */
        if (a_action == GIN_SEGMENT_INSERT ||
            a_action == GIN_SEGMENT_REPLACE)
        {
            newseg = (GinPostingList *) walbuf;
            newsegsize = SizeOfGinPostingList(newseg);
            walbuf += SHORTALIGN(newsegsize);
        }

        if (a_action == GIN_SEGMENT_ADDITEMS)
        {
            /* memcpy, as the count is not accessed through a typed pointer */
            memcpy(&nitems, walbuf, sizeof(uint16));
            walbuf += sizeof(uint16);
            items = (ItemPointerData *) walbuf;
            walbuf += nitems * sizeof(ItemPointerData);
        }

        /* Skip to the segment that this action concerns */
        Assert(segno <= a_segno);
        while (segno < a_segno)
        {
            oldseg = GinNextPostingListSegment(oldseg);
            segno++;
        }

        /*
         * ADDITEMS action is handled like REPLACE, but the new segment to
         * replace the old one is reconstructed using the old segment from
         * disk and the new items from the WAL record.
         */
        if (a_action == GIN_SEGMENT_ADDITEMS)
        {
            ItemPointerData *olditems;
            int             nolditems;
            ItemPointerData *newitems;
            int             nnewitems;
            int             npacked;

            olditems = ginPostingListDecode(oldseg, &nolditems);

            newitems = ginMergeItemPointers(items, nitems,
                                            olditems, nolditems,
                                            &nnewitems);
            Assert(nnewitems == nolditems + nitems);

            newseg = ginCompressPostingList(newitems, nnewitems,
                                            BLCKSZ, &npacked);
            Assert(npacked == nnewitems);

            newsegsize = SizeOfGinPostingList(newseg);
            a_action = GIN_SEGMENT_REPLACE;
        }

        segptr = (Pointer) oldseg;
        if (segptr != segmentend)
            segsize = SizeOfGinPostingList(oldseg);
        else
        {
            /*
             * Positioned after the last existing segment. Only INSERTs
             * expected here.
             */
            Assert(a_action == GIN_SEGMENT_INSERT);
            segsize = 0;
        }

        /* Bytes from the current segment to the end of the used area */
        szleft = segmentend - segptr;

        switch (a_action)
        {
            case GIN_SEGMENT_DELETE:
                /* close the gap; segptr now addresses the next old segment */
                memmove(segptr, segptr + segsize, szleft - segsize);
                segmentend -= segsize;

                segno++;
                break;

            case GIN_SEGMENT_INSERT:
                /* make room for the new segment */
                memmove(segptr + newsegsize, segptr, szleft);
                /* copy the new segment in place */
                memcpy(segptr, newseg, newsegsize);
                segmentend += newsegsize;
                segptr += newsegsize;
                /* segno is not advanced: no old segment was consumed */
                break;

            case GIN_SEGMENT_REPLACE:
                /* shift the segments that follow */
                memmove(segptr + newsegsize,
                        segptr + segsize,
                        szleft - segsize);
                /* copy the replacement segment in place */
                memcpy(segptr, newseg, newsegsize);
                segmentend -= segsize;
                segmentend += newsegsize;
                segptr += newsegsize;
                segno++;
                break;

            default:
                elog(ERROR, "unexpected GIN leaf action: %u", a_action);
        }

        /* Continue from the segment that now follows the one just handled */
        oldseg = (GinPostingList *) segptr;
    }

    /* Record the new total size of the posting-list area */
    totalsize = segmentend - (Pointer) GinDataLeafPageGetPostingList(page);
    GinDataLeafPageSetPostingListSize(page, totalsize);
}
static void
......
......@@ -25,6 +25,57 @@ desc_node(StringInfo buf, RelFileNode node, BlockNumber blkno)
node.spcNode, node.dbNode, node.relNode, blkno);
}
/*
 * Append a short textual summary of every segment action in a leaf
 * recompression record to 'buf'.
 *
 * The actions are read starting sizeof(ginxlogRecompressDataLeaf) bytes after
 * 'insertData'.  For each one the segment number and the action are printed;
 * any payload that follows the two-byte action header is skipped over so the
 * next action can be found.  Decoding stops at the first unrecognized action
 * type, because its payload length is unknown.
 */
static void
desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData)
{
    char   *cursor = ((char *) insertData) + sizeof(ginxlogRecompressDataLeaf);
    int     actionidx = 0;

    appendStringInfo(buf, " %d segments:", (int) insertData->nactions);

    while (actionidx < insertData->nactions)
    {
        /* two-byte action header: segment number, then action type */
        uint8   segment = *((uint8 *) cursor);
        uint8   action = *((uint8 *) (cursor + 1));

        cursor += 2;

        switch (action)
        {
            case GIN_SEGMENT_ADDITEMS:
                {
                    uint16  itemcount = 0;

                    /* item count, then the item pointers themselves */
                    memcpy(&itemcount, cursor, sizeof(uint16));
                    cursor += sizeof(uint16);
                    cursor += itemcount * sizeof(ItemPointerData);

                    appendStringInfo(buf, " %d (add %d items)", segment, itemcount);
                }
                break;

            case GIN_SEGMENT_DELETE:
                /* no payload */
                appendStringInfo(buf, " %d (delete)", segment);
                break;

            case GIN_SEGMENT_INSERT:
                {
                    /* a whole posting list segment, padded to SHORTALIGN */
                    int     seglen = SizeOfGinPostingList((GinPostingList *) cursor);

                    cursor += SHORTALIGN(seglen);
                    appendStringInfo(buf, " %d (insert)", segment);
                }
                break;

            case GIN_SEGMENT_REPLACE:
                {
                    /* same payload shape as an insert */
                    int     seglen = SizeOfGinPostingList((GinPostingList *) cursor);

                    cursor += SHORTALIGN(seglen);
                    appendStringInfo(buf, " %d (replace)", segment);
                }
                break;

            default:
                appendStringInfo(buf, " %d unknown action %d ???", segment, action);
                /* cannot decode unrecognized actions further */
                return;
        }

        actionidx++;
    }
}
void
gin_desc(StringInfo buf, uint8 xl_info, char *rec)
{
......@@ -70,9 +121,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
ginxlogRecompressDataLeaf *insertData =
(ginxlogRecompressDataLeaf *) payload;
appendStringInfo(buf, " unmodified: %u length: %u (compressed)",
insertData->unmodifiedsize,
insertData->length);
if (xl_info & XLR_BKP_BLOCK(0))
appendStringInfo(buf, " (full page image)");
else
desc_recompress_leaf(buf, insertData);
}
else
{
......@@ -105,9 +157,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec;
appendStringInfoString(buf, "Vacuum data leaf page, ");
desc_node(buf, xlrec->node, xlrec->blkno);
appendStringInfo(buf, " unmodified: %u length: %u",
xlrec->data.unmodifiedsize,
xlrec->data.length);
if (xl_info & XLR_BKP_BLOCK(0))
appendStringInfo(buf, " (full page image)");
else
desc_recompress_leaf(buf, &xlrec->data);
}
break;
case XLOG_GIN_DELETE_PAGE:
......
......@@ -406,7 +406,8 @@ typedef struct
* whose split this insertion finishes. As BlockIdData[2] (beware of adding
* fields before this that would make them not 16-bit aligned)
*
* 2. one of the following structs, depending on tree type.
* 2. a ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending
* on tree type.
*
* NB: the below structs are only 16-bit aligned when appended to a
* ginxlogInsert struct! Beware of adding fields to them that require
......@@ -421,15 +422,39 @@ typedef struct
IndexTupleData tuple; /* variable length */
} ginxlogInsertEntry;
/*
 * Fixed-size header of the WAL payload describing changes to the posting
 * lists of a data leaf page.
 *
 * The code that walks the action list locates its first byte at
 * ((char *) header) + sizeof(ginxlogRecompressDataLeaf) and then decodes
 * 'nactions' variable-length actions from there, so any change to this
 * struct's size changes where the actions are expected to start.
 *
 * NOTE(review): 'length', 'unmodifiedsize' and 'newdata' look like they
 * belong to the earlier "copy everything after the unmodified prefix" record
 * layout, while 'nactions' belongs to the action-list layout; the loops that
 * decode actions use only 'nactions'.  Confirm whether the three older
 * members are still meant to be part of this struct.
 */
typedef struct
{
uint16 length;              /* printed as "length" by the description code */
uint16 unmodifiedsize;      /* printed as "unmodified" by the description code */
uint16 nactions;            /* number of actions that follow the header */
/* compressed segments, variable length */
char newdata[1];
/* Variable number of 'actions' follow */
} ginxlogRecompressDataLeaf;
/*
 * Header of a single action in a leaf recompression WAL record.
 *
 * Note: this struct is currently not used in code, and only acts as
 * documentation. The WAL record format is as specified here, but the code
 * uses straight access through a Pointer and memcpy to read/write these.
 *
 * The readers in this file fetch both header bytes as uint8 values, one
 * after the other, and then skip or consume the action-specific data.
 */
typedef struct
{
uint8 segno; /* segment this action applies to */
char type; /* action type (see below); decoded as a uint8 by the readers */
/*
 * Action-specific data follows. For INSERT and REPLACE actions that is a
 * GinPostingList struct. For ADDITEMS, a uint16 for the number of items
 * added, followed by the items themselves as ItemPointers. DELETE actions
 * have no further data.
 */
} ginxlogSegmentAction;
/*
 * Action types.
 *
 * These values are stored in the second byte of each action in a leaf
 * recompression record.  What follows that byte depends on the type:
 * nothing for DELETE, a GinPostingList (advanced over in SHORTALIGN'd
 * steps) for INSERT and REPLACE, and a uint16 count plus that many
 * ItemPointerData for ADDITEMS.
 */
#define GIN_SEGMENT_UNMODIFIED 0 /* no action (not used in WAL records) */
#define GIN_SEGMENT_DELETE 1 /* a whole segment is removed */
#define GIN_SEGMENT_INSERT 2 /* a whole segment is added */
#define GIN_SEGMENT_REPLACE 3 /* a segment is replaced */
#define GIN_SEGMENT_ADDITEMS 4 /* items are added to existing segment */
typedef struct
{
OffsetNumber offset;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment