Commit 503c7305 authored by Robert Haas

Make the visibility map crash-safe.

This involves two main changes from the previous behavior.  First,
when we set a bit in the visibility map, we now emit a new WAL record
of type XLOG_HEAP2_VISIBLE.  Replay sets the page-level PD_ALL_VISIBLE
bit and the visibility map bit.  Second, when inserting, updating, or deleting
a tuple, we can no longer get away with clearing the visibility map
bit after releasing the lock on the corresponding heap page, because
an intervening crash might leave the visibility map bit set and the
page-level bit clear.  Making this work requires a bit of interface
refactoring.

In passing, a few minor but related cleanups: change the test in
visibilitymap_set and visibilitymap_clear to throw an error if the
wrong page (or no page) is pinned, rather than silently doing nothing;
this case should never occur.  Also, remove duplicate definitions of
InvalidXLogRecPtr.

Patch by me, review by Noah Misch.
parent 431ab0e8
(Part of this commit's diff is collapsed and not shown below.)
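For context on the first change: the visible hunks add the xl_heap_visible record type, the XLOG_HEAP2_VISIBLE info code, and a prototype for log_heap_visible, but the heapam.c hunks that actually emit the record are among the collapsed portions. What follows is only a rough sketch of how such a record could be assembled and inserted; the XLogRecData chaining and the choice to register the visibility map buffer are assumptions modeled on the existing log_heap_* routines, not text taken from this commit.

/* Sketch only -- the rdata layout below is assumed, not copied from the commit. */
XLogRecPtr
log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer)
{
	xl_heap_visible xlrec;
	XLogRecPtr	recptr;
	XLogRecData rdata[2];

	xlrec.node = rnode;
	xlrec.block = block;

	/* fixed-size payload: which heap block became all-visible */
	rdata[0].data = (char *) &xlrec;
	rdata[0].len = SizeOfHeapVisible;
	rdata[0].buffer = InvalidBuffer;
	rdata[0].next = &(rdata[1]);

	/* register the vm buffer so replay can locate (or restore) the map page */
	rdata[1].data = NULL;
	rdata[1].len = 0;
	rdata[1].buffer = vm_buffer;
	rdata[1].buffer_std = false;
	rdata[1].next = NULL;

	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE, rdata);

	return recptr;
}

On replay, per the commit message, the redo routine must set both the page-level PD_ALL_VISIBLE bit on the heap page and the bit in the visibility map; the visibilitymap_set changes below make the latter workable during recovery, since the function now accepts the replayed record's LSN. The second change shows up in the hio.c hunks: the visibility map page is pinned before the heap buffer is exclusive-locked, so the bit can later be cleared while the heap-page lock is still held, without doing I/O under the lock.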
@@ -17,6 +17,7 @@
#include "access/heapam.h"
#include "access/hio.h"
#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
@@ -150,7 +151,8 @@ ReadBufferBI(Relation relation, BlockNumber targetBlock,
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
Buffer otherBuffer, int options,
struct BulkInsertStateData * bistate)
struct BulkInsertStateData * bistate,
Buffer *vmbuffer)
{
bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
Buffer buffer = InvalidBuffer;
@@ -237,23 +239,37 @@ RelationGetBufferForTuple(Relation relation, Size len,
* Read and exclusive-lock the target block, as well as the other
* block if one was given, taking suitable care with lock ordering and
* the possibility they are the same block.
*
* If the page-level all-visible flag is set, caller will need to clear
* both that and the corresponding visibility map bit. However, by the
* time we return, we'll have x-locked the buffer, and we don't want to
* do any I/O while in that state. So we check the bit here before
* taking the lock, and pin the page if it appears necessary.
* Checking without the lock creates a risk of getting the wrong
* answer, so we'll have to recheck after acquiring the lock.
*/
if (otherBuffer == InvalidBuffer)
{
/* easy case */
buffer = ReadBufferBI(relation, targetBlock, bistate);
if (PageIsAllVisible(BufferGetPage(buffer)))
visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
else if (otherBlock == targetBlock)
{
/* also easy case */
buffer = otherBuffer;
if (PageIsAllVisible(BufferGetPage(buffer)))
visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
else if (otherBlock < targetBlock)
{
/* lock other buffer first */
buffer = ReadBuffer(relation, targetBlock);
if (PageIsAllVisible(BufferGetPage(buffer)))
visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
@@ -261,10 +277,40 @@ RelationGetBufferForTuple(Relation relation, Size len,
{
/* lock target buffer first */
buffer = ReadBuffer(relation, targetBlock);
if (PageIsAllVisible(BufferGetPage(buffer)))
visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
}
/*
* If the page is all visible but we don't have the right visibility
* map page pinned, then give up our locks, go get the pin, and
* re-lock. This is pretty painful, but hopefully shouldn't happen
* often. Note that there's a small possibility that we didn't pin
* the page above but still have the correct page pinned anyway, either
* because we've already made a previous pass through this loop, or
* because caller passed us the right page anyway.
*
* Note also that it's possible that by the time we get the pin and
* retake the buffer locks, the visibility map bit will have been
* cleared by some other backend anyway. In that case, we'll have done
* a bit of extra work for no gain, but there's no real harm done.
*/
if (PageIsAllVisible(BufferGetPage(buffer))
&& !visibilitymap_pin_ok(targetBlock, *vmbuffer))
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
if (otherBlock != targetBlock)
LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
visibilitymap_pin(relation, targetBlock, vmbuffer);
if (otherBuffer != InvalidBuffer && otherBlock < targetBlock)
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
if (otherBuffer != InvalidBuffer && otherBlock > targetBlock)
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
}
/*
* Now we can check to see if there's enough free space here. If so,
* we're done.
......
@@ -11,10 +11,11 @@
* src/backend/access/heap/visibilitymap.c
*
* INTERFACE ROUTINES
* visibilitymap_clear - clear a bit in the visibility map
* visibilitymap_pin - pin a map page for setting a bit
* visibilitymap_set - set a bit in a previously pinned page
* visibilitymap_test - test if a bit is set
* visibilitymap_clear - clear a bit in the visibility map
* visibilitymap_pin - pin a map page for setting a bit
* visibilitymap_pin_ok - check whether correct map page is already pinned
* visibilitymap_set - set a bit in a previously pinned page
* visibilitymap_test - test if a bit is set
*
* NOTES
*
@@ -64,32 +65,13 @@
* It would be nice to use the visibility map to skip visibility checks in
* index scans.
*
* Currently, the visibility map is not 100% correct all the time.
* During updates, the bit in the visibility map is cleared after releasing
* the lock on the heap page. During the window between releasing the lock
* and clearing the bit in the visibility map, the bit in the visibility map
* is set, but the new insertion or deletion is not yet visible to other
* backends.
*
* That might actually be OK for the index scans, though. The newly inserted
* tuple wouldn't have an index pointer yet, so all tuples reachable from an
* index would still be visible to all other backends, and deletions wouldn't
* be visible to other backends yet. (But HOT breaks that argument, no?)
*
* There's another hole in the way the PD_ALL_VISIBLE flag is set. When
* vacuum observes that all tuples are visible to all, it sets the flag on
* the heap page, and also sets the bit in the visibility map. If we then
* crash, and only the visibility map page was flushed to disk, we'll have
* a bit set in the visibility map, but the corresponding flag on the heap
* page is not set. If the heap page is then updated, the updater won't
* know to clear the bit in the visibility map. (Isn't that prevented by
* the LSN interlock?)
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/heapam.h"
#include "access/visibilitymap.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/lmgr.h"
@@ -127,38 +109,37 @@ static void vm_extend(Relation rel, BlockNumber nvmblocks);
/*
* visibilitymap_clear - clear a bit in visibility map
*
* Clear a bit in the visibility map, marking that not all tuples are
* visible to all transactions anymore.
* You must pass a buffer containing the correct map page to this function.
* Call visibilitymap_pin first to pin the right one. This function doesn't do
* any I/O.
*/
void
visibilitymap_clear(Relation rel, BlockNumber heapBlk)
visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
int mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
uint8 mask = 1 << mapBit;
Buffer mapBuffer;
char *map;
#ifdef TRACE_VISIBILITYMAP
elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
#endif
mapBuffer = vm_readbuf(rel, mapBlock, false);
if (!BufferIsValid(mapBuffer))
return; /* nothing to do */
if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
elog(ERROR, "wrong buffer passed to visibilitymap_clear");
LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
map = PageGetContents(BufferGetPage(mapBuffer));
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
map = PageGetContents(BufferGetPage(buf));
if (map[mapByte] & mask)
{
map[mapByte] &= ~mask;
MarkBufferDirty(mapBuffer);
MarkBufferDirty(buf);
}
UnlockReleaseBuffer(mapBuffer);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
/*
@@ -193,20 +174,37 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
*buf = vm_readbuf(rel, mapBlock, true);
}
/*
* visibilitymap_pin_ok - do we already have the correct page pinned?
*
* On entry, buf should be InvalidBuffer or a valid buffer returned by
* an earlier call to visibilitymap_pin or visibilitymap_test on the same
* relation. The return value indicates whether the buffer covers the
* given heapBlk.
*/
bool
visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
return BufferIsValid(buf) && BufferGetBlockNumber(buf) == mapBlock;
}
/*
* visibilitymap_set - set a bit on a previously pinned page
*
* recptr is the LSN of the heap page. The LSN of the visibility map page is
* advanced to that, to make sure that the visibility map doesn't get flushed
* to disk before the update to the heap page that made all tuples visible.
* recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
* or InvalidXLogRecPtr in normal running. The page LSN is advanced to the
* one provided; in normal running, we generate a new XLOG record and set the
* page LSN to that value.
*
* This is an opportunistic function. It does nothing, unless *buf
* contains the bit for heapBlk. Call visibilitymap_pin first to pin
* the right map page. This function doesn't do any I/O.
* You must pass a buffer containing the correct map page to this function.
* Call visibilitymap_pin first to pin the right one. This function doesn't do
* any I/O.
*/
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
Buffer *buf)
Buffer buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
@@ -218,25 +216,35 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif
Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
/* Check that we have the right page pinned */
if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != mapBlock)
return;
if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
elog(ERROR, "wrong buffer passed to visibilitymap_set");
page = BufferGetPage(*buf);
page = BufferGetPage(buf);
map = PageGetContents(page);
LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
if (!(map[mapByte] & (1 << mapBit)))
{
START_CRIT_SECTION();
map[mapByte] |= (1 << mapBit);
MarkBufferDirty(buf);
if (XLByteLT(PageGetLSN(page), recptr))
if (RelationNeedsWAL(rel))
{
if (XLogRecPtrIsInvalid(recptr))
recptr = log_heap_visible(rel->rd_node, heapBlk, buf);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(*buf);
PageSetTLI(page, ThisTimeLineID);
}
END_CRIT_SECTION();
}
LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
/*
......
@@ -24,6 +24,8 @@
#include "access/transam.h"
#include "utils/snapmgr.h"
/* Handy constant for an invalid xlog recptr */
const XLogRecPtr InvalidXLogRecPtr = {0, 0};
/*
* Single-item cache for results of TransactionLogFetch. It's worth having
@@ -35,9 +37,6 @@ static TransactionId cachedFetchXid = InvalidTransactionId;
static XidStatus cachedFetchXidStatus;
static XLogRecPtr cachedCommitLSN;
/* Handy constant for an invalid xlog recptr */
static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
/* Local functions */
static XidStatus TransactionLogFetch(TransactionId transactionId);
......
@@ -5462,7 +5462,6 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
{
char recoveryPath[MAXPGPATH];
char xlogpath[MAXPGPATH];
XLogRecPtr InvalidXLogRecPtr = {0, 0};
/*
* We are no longer in archive recovery state.
@@ -8069,8 +8068,6 @@ CreateRestartPoint(int flags)
if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
{
XLogRecPtr InvalidXLogRecPtr = {0, 0};
ereport(DEBUG2,
(errmsg("skipping restartpoint, already performed at %X/%X",
lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
......
@@ -513,7 +513,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
visibilitymap_pin(onerel, blkno, &vmbuffer);
LockBuffer(buf, BUFFER_LOCK_SHARE);
if (PageIsAllVisible(page))
visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
visibilitymap_set(onerel, blkno, InvalidXLogRecPtr,
vmbuffer);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
@@ -765,7 +766,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
* updating the visibility map, but since this case shouldn't
* happen anyway, don't worry about that.
*/
visibilitymap_clear(onerel, blkno);
visibilitymap_pin(onerel, blkno, &vmbuffer);
visibilitymap_clear(onerel, blkno, vmbuffer);
}
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
@@ -776,7 +778,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
visibilitymap_pin(onerel, blkno, &vmbuffer);
LockBuffer(buf, BUFFER_LOCK_SHARE);
if (PageIsAllVisible(page))
visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
......
@@ -136,6 +136,8 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid,
OffsetNumber *offsets, int offcnt);
extern XLogRecPtr log_heap_visible(RelFileNode rnode, BlockNumber block,
Buffer vm_buffer);
extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
BlockNumber blk, Page page);
......
@@ -38,6 +38,7 @@ extern void RelationPutHeapTuple(Relation relation, Buffer buffer,
HeapTuple tuple);
extern Buffer RelationGetBufferForTuple(Relation relation, Size len,
Buffer otherBuffer, int options,
struct BulkInsertStateData * bistate);
struct BulkInsertStateData * bistate,
Buffer *vmbuffer);
#endif /* HIO_H */
@@ -606,6 +606,7 @@ typedef HeapTupleData *HeapTuple;
#define XLOG_HEAP2_CLEAN 0x10
/* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */
#define XLOG_HEAP2_CLEANUP_INFO 0x30
#define XLOG_HEAP2_VISIBLE 0x40
/*
* All what we need to find changed tuple
@@ -750,6 +751,15 @@ typedef struct xl_heap_freeze
#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
/* This is what we need to know about setting a visibility map bit */
typedef struct xl_heap_visible
{
RelFileNode node;
BlockNumber block;
} xl_heap_visible;
#define SizeOfHeapVisible (offsetof(xl_heap_visible, block) + sizeof(BlockNumber))
extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
TransactionId *latestRemovedXid);
......
@@ -135,6 +135,9 @@ extern bool TransactionStartedDuringRecovery(void);
/* in transam/varsup.c */
extern PGDLLIMPORT VariableCache ShmemVariableCache;
/* in transam/transam.c */
extern const XLogRecPtr InvalidXLogRecPtr;
/*
* prototypes for functions in transam/transam.c
......
@@ -19,11 +19,13 @@
#include "storage/buf.h"
#include "utils/relcache.h"
extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk);
extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk,
Buffer vmbuf);
extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
Buffer *vmbuf);
extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
extern void visibilitymap_set(Relation rel, BlockNumber heapBlk,
XLogRecPtr recptr, Buffer *vmbuf);
XLogRecPtr recptr, Buffer vmbuf);
extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf);
extern void visibilitymap_truncate(Relation rel, BlockNumber heapblk);
......
@@ -71,7 +71,7 @@ typedef struct XLogContRecord
/*
* Each page of XLOG file has a header like this:
*/
#define XLOG_PAGE_MAGIC 0xD066 /* can be used as WAL version indicator */
#define XLOG_PAGE_MAGIC 0xD067 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
......