Commit 3bbd6af3 authored by Tom Lane's avatar Tom Lane

Adjust btbulkdelete logic so that only one WAL record is issued while

deleting multiple index entries on a single index page.  This makes for
a very substantial reduction in the amount of WAL traffic during a
large delete operation.
parent 13dadef8
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.61 2003/02/23 06:17:13 tgl Exp $ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.62 2003/02/23 22:43:08 tgl Exp $
* *
* NOTES * NOTES
* Postgres btree pages look like ordinary relation pages. The opaque * Postgres btree pages look like ordinary relation pages. The opaque
...@@ -618,26 +618,34 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level) ...@@ -618,26 +618,34 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
} }
/* /*
* Delete an item from a btree page. * Delete item(s) from a btree page.
* *
* This must only be used for deleting leaf items. Deleting an item on a * This must only be used for deleting leaf items. Deleting an item on a
* non-leaf page has to be done as part of an atomic action that includes * non-leaf page has to be done as part of an atomic action that includes
* deleting the page it points to. * deleting the page it points to.
* *
* This routine assumes that the caller has pinned and locked the buffer, * This routine assumes that the caller has pinned and locked the buffer,
* and will write the buffer afterwards. * and will write the buffer afterwards. Also, the given itemnos *must*
* appear in increasing order in the array.
*/ */
void void
_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid) _bt_delitems(Relation rel, Buffer buf,
OffsetNumber *itemnos, int nitems)
{ {
Page page = BufferGetPage(buf); Page page = BufferGetPage(buf);
OffsetNumber offno; int i;
offno = ItemPointerGetOffsetNumber(tid);
/* No elog(ERROR) until changes are logged */
START_CRIT_SECTION(); START_CRIT_SECTION();
PageIndexTupleDelete(page, offno); /*
* Delete the items in reverse order so we don't have to think about
* adjusting item numbers for previous deletions.
*/
for (i = nitems - 1; i >= 0; i--)
{
PageIndexTupleDelete(page, itemnos[i]);
}
/* XLOG stuff */ /* XLOG stuff */
if (!rel->rd_istemp) if (!rel->rd_istemp)
...@@ -646,17 +654,30 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid) ...@@ -646,17 +654,30 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
XLogRecPtr recptr; XLogRecPtr recptr;
XLogRecData rdata[2]; XLogRecData rdata[2];
xlrec.target.node = rel->rd_node; xlrec.node = rel->rd_node;
xlrec.target.tid = *tid; xlrec.block = BufferGetBlockNumber(buf);
rdata[0].buffer = InvalidBuffer; rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &xlrec; rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeDelete; rdata[0].len = SizeOfBtreeDelete;
rdata[0].next = &(rdata[1]); rdata[0].next = &(rdata[1]);
/*
* The target-offsets array is not in the buffer, but pretend
* that it is. When XLogInsert stores the whole buffer, the offsets
* array need not be stored too.
*/
rdata[1].buffer = buf; rdata[1].buffer = buf;
rdata[1].data = NULL; if (nitems > 0)
rdata[1].len = 0; {
rdata[1].data = (char *) itemnos;
rdata[1].len = nitems * sizeof(OffsetNumber);
}
else
{
rdata[1].data = NULL;
rdata[1].len = 0;
}
rdata[1].next = NULL; rdata[1].next = NULL;
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/23 06:17:13 tgl Exp $ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/23 22:43:08 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS) ...@@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS)
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
void *callback_state = (void *) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(2);
IndexBulkDeleteResult *result; IndexBulkDeleteResult *result;
BlockNumber num_pages;
double tuples_removed; double tuples_removed;
double num_index_tuples; double num_index_tuples;
IndexScanDesc scan; OffsetNumber deletable[BLCKSZ / sizeof(OffsetNumber)];
BTScanOpaque so; int ndeletable;
ItemPointer current; Buffer buf;
BlockNumber num_pages;
tuples_removed = 0; tuples_removed = 0;
num_index_tuples = 0; num_index_tuples = 0;
/* /*
* We use a standard IndexScanDesc scan object, but to speed up the * The outer loop iterates over index leaf pages, the inner over items
* loop, we skip most of the wrapper layers of index_getnext and * on a leaf page. We issue just one _bt_delitems() call per page,
* instead call _bt_step directly. This implies holding buffer lock * so as to minimize WAL traffic.
* on a target page throughout the loop over the page's tuples.
*
* Whenever we step onto a new page, we have to trade in the read
* lock acquired by _bt_first or _bt_step for an exclusive write lock
* (fortunately, _bt_relbuf doesn't care which kind of lock it's
* releasing when it comes time for _bt_step to release our lock).
* *
* Note that we exclusive-lock every leaf page, or at least every one * Note that we exclusive-lock every leaf page containing data items,
* containing data items. It sounds attractive to only exclusive-lock * in sequence left to right. It sounds attractive to only exclusive-lock
* those containing items we need to delete, but unfortunately that * those containing items we need to delete, but unfortunately that
* is not safe: we could then pass a stopped indexscan, which could * is not safe: we could then pass a stopped indexscan, which could
* in rare cases lead to deleting the item it needs to find when it * in rare cases lead to deleting the item it needs to find when it
* resumes. (See _bt_restscan --- this could only happen if an indexscan * resumes. (See _bt_restscan --- this could only happen if an indexscan
* stops on a deletable item and then a page split moves that item * stops on a deletable item and then a page split moves that item
* into a page further to its right, which the indexscan will have no * into a page further to its right, which the indexscan will have no
* pin on.) * pin on.) We can skip obtaining exclusive lock on empty pages
* though, since no indexscan could be stopped on those.
*/ */
scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL); buf = _bt_get_endpoint(rel, 0, false);
so = (BTScanOpaque) scan->opaque; if (BufferIsValid(buf)) /* check for empty index */
current = &(scan->currentItemData);
/* Use _bt_first to get started, then _bt_step to remaining tuples */
if (_bt_first(scan, ForwardScanDirection))
{ {
Buffer buf; for (;;)
BlockNumber lockedBlock = InvalidBlockNumber;
/* we have the buffer pinned and read-locked */
buf = so->btso_curbuf;
Assert(BufferIsValid(buf));
do
{ {
Page page; Page page;
BlockNumber blkno;
OffsetNumber offnum;
BTItem btitem;
BTPageOpaque opaque; BTPageOpaque opaque;
IndexTuple itup; OffsetNumber offnum,
ItemPointer htup; minoff,
maxoff;
BlockNumber nextpage;
CHECK_FOR_INTERRUPTS(); CHECK_FOR_INTERRUPTS();
/* current is the next index tuple */ ndeletable = 0;
page = BufferGetPage(buf); page = BufferGetPage(buf);
blkno = ItemPointerGetBlockNumber(current); opaque = (BTPageOpaque) PageGetSpecialPointer(page);
minoff = P_FIRSTDATAKEY(opaque);
/* maxoff = PageGetMaxOffsetNumber(page);
* Make sure we have a super-exclusive write lock on this page. /* We probably cannot see deleted pages, but skip 'em if so */
* if (minoff <= maxoff && !P_ISDELETED(opaque))
* We assume that only concurrent insertions, not deletions,
* can occur while we're not holding the page lock (the
* caller should hold a suitable relation lock to ensure
* this). Therefore, no items can escape being scanned because
* of this temporary lock release.
*/
if (blkno != lockedBlock)
{ {
/*
* Trade in the initial read lock for a super-exclusive
* write lock on this page.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBufferForCleanup(buf); LockBufferForCleanup(buf);
lockedBlock = blkno;
/* /*
* If the page was formerly rightmost but was split while we * Recompute minoff/maxoff, both of which could have changed
* didn't hold the lock, and ip_posid is pointing to item * while we weren't holding the lock.
* 1, then ip_posid now points at the high key not a valid
* data item. In this case we need to step forward.
*/ */
opaque = (BTPageOpaque) PageGetSpecialPointer(page); minoff = P_FIRSTDATAKEY(opaque);
if (current->ip_posid < P_FIRSTDATAKEY(opaque)) maxoff = PageGetMaxOffsetNumber(page);
current->ip_posid = P_FIRSTDATAKEY(opaque);
}
offnum = ItemPointerGetOffsetNumber(current);
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &btitem->bti_itup;
htup = &(itup->t_tid);
if (callback(htup, callback_state))
{
/* Okay to delete the item from the page */
_bt_itemdel(rel, buf, current);
/* Mark buffer dirty, but keep the lock and pin */
WriteNoReleaseBuffer(buf);
tuples_removed += 1;
/* /*
* We now need to back up the scan one item, so that the next * Scan over all items to see which ones need deleted
* cycle will re-examine the same offnum on this page (which * according to the callback function.
* now holds the next item).
*/ */
current->ip_posid--; for (offnum = minoff;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
BTItem btitem;
ItemPointer htup;
btitem = (BTItem) PageGetItem(page,
PageGetItemId(page, offnum));
htup = &(btitem->bti_itup.t_tid);
if (callback(htup, callback_state))
{
deletable[ndeletable++] = offnum;
tuples_removed += 1;
}
else
num_index_tuples += 1;
}
}
/*
* If we need to delete anything, do it and write the buffer;
* else just release the buffer.
*/
nextpage = opaque->btpo_next;
if (ndeletable > 0)
{
_bt_delitems(rel, buf, deletable, ndeletable);
_bt_wrtbuf(rel, buf);
} }
else else
num_index_tuples += 1; {
} while (_bt_step(scan, &buf, ForwardScanDirection)); _bt_relbuf(rel, buf);
}
/* And advance to next page, if any */
if (nextpage == P_NONE)
break;
buf = _bt_getbuf(rel, nextpage, BT_READ);
}
} }
index_endscan(scan);
/* return statistics */ /* return statistics */
num_pages = RelationGetNumberOfBlocks(rel); num_pages = RelationGetNumberOfBlocks(rel);
...@@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS) ...@@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
} }
} }
else if ((opaque->btpo_flags & BTP_HALF_DEAD) || else if ((opaque->btpo_flags & BTP_HALF_DEAD) ||
P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)) P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
{ {
/* Empty, try to delete */ /* Empty, try to delete */
int ndel; int ndel;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.2 2003/02/23 06:17:13 tgl Exp $ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.3 2003/02/23 22:43:08 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -379,11 +379,10 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record) ...@@ -379,11 +379,10 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
return; return;
xlrec = (xl_btree_delete *) XLogRecGetData(record); xlrec = (xl_btree_delete *) XLogRecGetData(record);
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node); reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
if (!RelationIsValid(reln)) if (!RelationIsValid(reln))
return; return;
buffer = XLogReadBuffer(false, reln, buffer = XLogReadBuffer(false, reln, xlrec->block);
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
if (!BufferIsValid(buffer)) if (!BufferIsValid(buffer))
elog(PANIC, "btree_delete_redo: block unfound"); elog(PANIC, "btree_delete_redo: block unfound");
page = (Page) BufferGetPage(buffer); page = (Page) BufferGetPage(buffer);
...@@ -396,7 +395,21 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record) ...@@ -396,7 +395,21 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
return; return;
} }
PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid))); if (record->xl_len > SizeOfBtreeDelete)
{
OffsetNumber *unused;
OffsetNumber *unend;
unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
/* be careful to delete from back to front */
while (unused < unend)
{
unend--;
PageIndexTupleDelete(page, *unend);
}
}
PageSetLSN(page, lsn); PageSetLSN(page, lsn);
PageSetSUI(page, ThisStartUpID); PageSetSUI(page, ThisStartUpID);
...@@ -853,8 +866,8 @@ btree_desc(char *buf, uint8 xl_info, char *rec) ...@@ -853,8 +866,8 @@ btree_desc(char *buf, uint8 xl_info, char *rec)
{ {
xl_btree_delete *xlrec = (xl_btree_delete *) rec; xl_btree_delete *xlrec = (xl_btree_delete *) rec;
strcat(buf, "delete: "); sprintf(buf + strlen(buf), "delete: node %u/%u; blk %u",
out_target(buf, &(xlrec->target)); xlrec->node.tblNode, xlrec->node.relNode, xlrec->block);
break; break;
} }
case XLOG_BTREE_DELETE_PAGE: case XLOG_BTREE_DELETE_PAGE:
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $Id: nbtree.h,v 1.66 2003/02/23 06:17:13 tgl Exp $ * $Id: nbtree.h,v 1.67 2003/02/23 22:43:09 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -263,14 +263,18 @@ typedef struct xl_btree_split ...@@ -263,14 +263,18 @@ typedef struct xl_btree_split
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16)) #define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
/* /*
* This is what we need to know about delete of an individual leaf btitem * This is what we need to know about delete of individual leaf btitems.
* The WAL record can represent deletion of any number of btitems on a
* single index page.
*/ */
typedef struct xl_btree_delete typedef struct xl_btree_delete
{ {
xl_btreetid target; /* deleted tuple id */ RelFileNode node;
BlockNumber block;
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete; } xl_btree_delete;
#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData) #define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
/* /*
* This is what we need to know about deletion of a btree page. The target * This is what we need to know about deletion of a btree page. The target
...@@ -453,7 +457,8 @@ extern void _bt_wrtnorelbuf(Relation rel, Buffer buf); ...@@ -453,7 +457,8 @@ extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size); extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page); extern bool _bt_page_recyclable(Page page);
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level); extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid); extern void _bt_delitems(Relation rel, Buffer buf,
OffsetNumber *itemnos, int nitems);
extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full); extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
/* /*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment