Commit a4ccc1ce authored by Simon Riggs

Generational memory allocator

Add a new style of memory allocator, known as Generational,
appropriate for use in cases where memory is allocated
and then freed in roughly oldest-first order (FIFO).

Use the new allocator for logical decoding's reorderbuffer
to significantly reduce memory usage and improve performance.

Author: Tomas Vondra
Reviewed-by: Simon Riggs
parent 3bae43ca
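To make the allocation pattern concrete, here is a minimal sketch of how a Generation context is meant to be used. GenerationContextCreate and SLAB_LARGE_BLOCK_SIZE come from this commit; the demo function, loop, and sizes are illustrative, not part of the patch:

    #include "postgres.h"
    #include "utils/memutils.h"

    static void
    generation_demo(void)
    {
        MemoryContext gen;
        char       *chunks[1024];
        int         i;

        /* Create a Generation context as a child of the current context. */
        gen = GenerationContextCreate(CurrentMemoryContext,
                                      "demo",
                                      SLAB_LARGE_BLOCK_SIZE);

        /* Allocate a stream of variable-length chunks ... */
        for (i = 0; i < 1024; i++)
            chunks[i] = MemoryContextAlloc(gen, 100 + i % 900);

        /*
         * ... and free them in roughly allocation order.  Once every chunk
         * on a block has been freed, the whole block can be given back,
         * which is exactly the case this allocator targets.
         */
        for (i = 0; i < 1024; i++)
            pfree(chunks[i]);

        MemoryContextDelete(gen);
    }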
@@ -43,6 +43,12 @@
  * transaction there will be no other data carrying records between a row's
  * toast chunks and the row data itself. See ReorderBufferToast* for
  * details.
+ *
+ * ReorderBuffer uses two special memory context types - SlabContext for
+ * allocations of fixed-length structures (changes and transactions), and
+ * GenerationContext for the variable-length transaction data (allocated
+ * and freed in groups with similar lifespan).
+ *
  * -------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -150,15 +156,6 @@ typedef struct ReorderBufferDiskChange
  */
 static const Size max_changes_in_memory = 4096;
 
-/*
- * We use a very simple form of a slab allocator for frequently allocated
- * objects, simply keeping a fixed number in a linked list when unused,
- * instead pfree()ing them. Without that in many workloads aset.c becomes a
- * major bottleneck, especially when spilling to disk while decoding batch
- * workloads.
- */
-static const Size max_cached_tuplebufs = 4096 * 2;  /* ~8MB */
-
 /* ---------------------------------------
  * primary reorderbuffer support routines
  * ---------------------------------------
@@ -248,6 +245,10 @@ ReorderBufferAllocate(void)
                                           SLAB_DEFAULT_BLOCK_SIZE,
                                           sizeof(ReorderBufferTXN));
 
+    buffer->tup_context = GenerationContextCreate(new_ctx,
+                                                  "Tuples",
+                                                  SLAB_LARGE_BLOCK_SIZE);
+
     hash_ctl.keysize = sizeof(TransactionId);
     hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
     hash_ctl.hcxt = buffer->context;
@@ -258,15 +259,12 @@ ReorderBufferAllocate(void)
     buffer->by_txn_last_xid = InvalidTransactionId;
     buffer->by_txn_last_txn = NULL;
 
-    buffer->nr_cached_tuplebufs = 0;
-
     buffer->outbuf = NULL;
     buffer->outbufsize = 0;
 
     buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
     dlist_init(&buffer->toplevel_by_lsn);
-    slist_init(&buffer->cached_tuplebufs);
 
     return buffer;
 }
@@ -419,42 +417,12 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
     alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-    /*
-     * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-     * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-     * generated for oldtuples can be bigger, as they don't have out-of-line
-     * toast columns.
-     */
-    if (alloc_len < MaxHeapTupleSize)
-        alloc_len = MaxHeapTupleSize;
-
-    /* if small enough, check the slab cache */
-    if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
-    {
-        rb->nr_cached_tuplebufs--;
-        tuple = slist_container(ReorderBufferTupleBuf, node,
-                                slist_pop_head_node(&rb->cached_tuplebufs));
-        Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-        memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-        VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
-        tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-        memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-        VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
-    }
-    else
-    {
-        tuple = (ReorderBufferTupleBuf *)
-            MemoryContextAlloc(rb->context,
-                               sizeof(ReorderBufferTupleBuf) +
-                               MAXIMUM_ALIGNOF + alloc_len);
-        tuple->alloc_tuple_size = alloc_len;
-        tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-    }
+    tuple = (ReorderBufferTupleBuf *)
+        MemoryContextAlloc(rb->tup_context,
+                           sizeof(ReorderBufferTupleBuf) +
+                           MAXIMUM_ALIGNOF + alloc_len);
+    tuple->alloc_tuple_size = alloc_len;
+    tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
     return tuple;
 }
@@ -468,21 +436,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-    /* check whether to put into the slab cache, oversized tuples never are */
-    if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-        rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-    {
-        rb->nr_cached_tuplebufs++;
-        slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-        VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-        VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-        VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-        VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-    }
-    else
-    {
-        pfree(tuple);
-    }
+    pfree(tuple);
 }
 
 /*
...
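Why a bare pfree() suffices in the hunk above: the generic pfree() recovers the owning context from the chunk header and dispatches to that context's free method, so tuples allocated in rb->tup_context are routed to generation.c's free routine automatically. A simplified sketch of the mcxt.c entry point (not the verbatim source):

    void
    pfree(void *pointer)
    {
        /* Every chunk records its owning context just before the pointer. */
        MemoryContext context = GetMemoryChunkContext(pointer);

        /*
         * Dispatch to the context's implementation (generation.c here),
         * which can recycle a whole block once all its chunks are freed.
         */
        (*context->methods->free_p) (context, pointer);
    }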
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
+OBJS = aset.o dsa.o freepage.o generation.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
@@ -431,3 +431,26 @@ will not allocate very much space per tuple cycle. To make this usage
 pattern cheap, the first block allocated in a context is not given
 back to malloc() during reset, but just cleared. This avoids malloc
 thrashing.
+
+
+Alternative Memory Context Implementations
+------------------------------------------
+
+aset.c is our default general-purpose implementation, working fine
+in most situations. We also have two implementations optimized for
+special use cases, providing either better performance or lower memory
+usage compared to aset.c (or both).
+
+* slab.c (SlabContext) is designed for allocations of fixed-length
+  chunks, and does not allow allocations of chunks with different size.
+
+* generation.c (GenerationContext) is designed for cases when chunks
+  are allocated in groups with similar lifespan (generations), or
+  roughly in FIFO order.
+
+Both memory contexts aim to free memory back to the operating system
+(unlike aset.c, which keeps the freed chunks in a freelist, and only
+returns the memory when reset/deleted).
+
+These memory contexts were initially developed for ReorderBuffer, but
+may be useful elsewhere as long as the allocation patterns match.
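As a hedged illustration of the "groups with similar lifespan" case the new README text describes (the function, context name, and sizes are made up for the example): two batches allocated back to back, where freeing the whole older batch lets the allocator hand its blocks back while the newer batch keeps its own blocks.

    static void
    generation_lifespan_demo(void)
    {
        MemoryContext gen;
        char       *older[100];
        char       *newer[100];
        int         i;

        gen = GenerationContextCreate(TopMemoryContext,
                                      "lifespan demo",
                                      SLAB_LARGE_BLOCK_SIZE);

        for (i = 0; i < 100; i++)       /* older generation */
            older[i] = MemoryContextAlloc(gen, 512);
        for (i = 0; i < 100; i++)       /* newer generation */
            newer[i] = MemoryContextAlloc(gen, 512);

        /*
         * Freeing the entire older generation empties the blocks holding
         * it, so that memory can go back to the OS while the newer
         * generation keeps its own blocks.  aset.c would instead keep the
         * freed chunks on its freelists until reset/delete.
         */
        for (i = 0; i < 100; i++)
            pfree(older[i]);

        /* ... keep using newer[], then clean up ... */
        MemoryContextDelete(gen);
    }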
[Diff collapsed: the new file src/backend/utils/mmgr/generation.c]
@@ -96,6 +96,8 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
     ((context) != NULL && \
-     (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
+     (IsA((context), AllocSetContext) || \
+      IsA((context), SlabContext) || \
+      IsA((context), GenerationContext)))
 
 #endif                          /* MEMNODES_H */
@@ -274,6 +274,7 @@ typedef enum NodeTag
     T_MemoryContext,
     T_AllocSetContext,
     T_SlabContext,
+    T_GenerationContext,
 
     /*
      * TAGS FOR VALUE NODES (value.h)
...
@@ -344,20 +344,7 @@ struct ReorderBuffer
      */
     MemoryContext change_context;
     MemoryContext txn_context;
+    MemoryContext tup_context;
 
-    /*
-     * Data structure slab cache.
-     *
-     * We allocate/deallocate some structures very frequently, to avoid bigger
-     * overhead we cache some unused ones here.
-     *
-     * The maximum number of cached entries is controlled by const variables
-     * on top of reorderbuffer.c
-     */
-
-    /* cached ReorderBufferTupleBufs */
-    slist_head  cached_tuplebufs;
-    Size        nr_cached_tuplebufs;
-
     XLogRecPtr  current_restart_decoding_lsn;
...
@@ -155,6 +155,11 @@ extern MemoryContext SlabContextCreate(MemoryContext parent,
                                        Size blockSize,
                                        Size chunkSize);
 
+/* generation.c */
+extern MemoryContext GenerationContextCreate(MemoryContext parent,
+                                             const char *name,
+                                             Size blockSize);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
...
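A sketch of calling the new creator declared above (the parent and name here are illustrative). Note that, unlike SlabContextCreate just above it, there is no chunkSize parameter, since Generation contexts serve variable-length requests; chunks from the resulting context work with the usual pfree() interface like any other memory context.

    MemoryContext tupcxt;

    tupcxt = GenerationContextCreate(CurrentMemoryContext,   /* parent */
                                     "Tuples",               /* context name */
                                     SLAB_LARGE_BLOCK_SIZE); /* block size */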