Commit d526575f authored by Tom Lane

Make large sequential scans and VACUUMs work in a limited-size "ring" of
buffers, rather than blowing out the whole shared-buffer arena.  Aside from
avoiding cache spoliation, this fixes the problem that VACUUM formerly tended
to cause a WAL flush for every page it modified, because we had it hacked to
use only a single buffer.  Those flushes will now occur only once per
ring-ful.  The exact ring size, and the threshold for seqscans to switch into
the ring usage pattern, remain under debate; but the infrastructure seems
done.  The key bit of infrastructure is a new optional BufferAccessStrategy
object that can be passed to ReadBuffer operations; this replaces the former
StrategyHintVacuum API.
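
For illustration, a minimal sketch of the API change, using only names that
appear in this patch (locking and error handling omitted):

    /* Before: a global hint, toggled around the whole operation */
    StrategyHintVacuum(true);
    buf = ReadBuffer(rel, blkno);
    /* ... */
    StrategyHintVacuum(false);

    /* After: an explicit strategy object, passed to each read */
    BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_VACUUM);

    buf = ReadBufferWithStrategy(rel, blkno, bstrategy);
    /* ... */
    FreeAccessStrategy(bstrategy);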

This patch also changes the buffer usage-count methodology a bit: we now
advance usage_count when first pinning a buffer, rather than when last
unpinning it.  To preserve the behavior that a buffer's lifetime starts to
decrease when it's released, the clock sweep code is modified to not decrement
usage_count of pinned buffers.
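
In sketch form (simplified; the local-buffer version of this appears in the
localbuf.c hunk below, and BM_MAX_USAGE_COUNT is the existing cap):

    /* advance usage_count when the pin count first goes 0 -> 1,
     * rather than when the last pin is released ...              */
    if (refcount == 0 && buf->usage_count < BM_MAX_USAGE_COUNT)
        buf->usage_count++;
    refcount++;

    /* ... and have the clock sweep leave pinned buffers' counts
     * alone, so a buffer still starts aging only once released.  */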

Work not done in this commit: teach GiST and GIN indexes to use the vacuum
BufferAccessStrategy for vacuum-driven fetches.

Original patch by Simon, reworked by Heikki and again by Tom.
parent 0a6f2ee8
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.94 2007/05/03 16:45:58 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.95 2007/05/30 20:11:51 tgl Exp $
*
* NOTES
* This file contains only the public interface routines.
......@@ -547,8 +547,9 @@ loop_top:
vacuum_delay_point();
buf = _hash_getbuf(rel, blkno, HASH_WRITE,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
info->strategy);
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == cur_bucket);
......@@ -596,7 +597,8 @@ loop_top:
/* If we deleted anything, try to compact free space */
if (bucket_dirty)
_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
_hash_squeezebucket(rel, cur_bucket, bucket_blkno,
info->strategy);
/* Release bucket lock */
_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.57 2007/05/03 16:45:58 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.58 2007/05/30 20:11:51 tgl Exp $
*
* NOTES
* Overflow pages look like ordinary relation pages.
......@@ -362,6 +362,9 @@ _hash_firstfreebit(uint32 map)
* Remove this overflow page from its bucket's chain, and mark the page as
* free. On entry, ovflbuf is write-locked; it is released before exiting.
*
* Since this function is invoked in VACUUM, we provide an access strategy
* parameter that controls fetches of the bucket pages.
*
* Returns the block number of the page that followed the given page
* in the bucket, or InvalidBlockNumber if no following page.
*
......@@ -370,7 +373,8 @@ _hash_firstfreebit(uint32 map)
* on the bucket, too.
*/
BlockNumber
_hash_freeovflpage(Relation rel, Buffer ovflbuf)
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
BufferAccessStrategy bstrategy)
{
HashMetaPage metap;
Buffer metabuf;
......@@ -413,8 +417,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
*/
if (BlockNumberIsValid(prevblkno))
{
Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
Buffer prevbuf = _hash_getbuf_with_strategy(rel,
prevblkno,
HASH_WRITE,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
bstrategy);
Page prevpage = BufferGetPage(prevbuf);
HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);
......@@ -424,8 +431,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
}
if (BlockNumberIsValid(nextblkno))
{
Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE,
LH_OVERFLOW_PAGE);
Buffer nextbuf = _hash_getbuf_with_strategy(rel,
nextblkno,
HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
Page nextpage = BufferGetPage(nextbuf);
HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);
......@@ -434,6 +444,8 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
_hash_wrtbuf(rel, nextbuf);
}
/* Note: bstrategy is intentionally not used for metapage and bitmap */
/* Read the metapage so we can determine which bitmap page to use */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = (HashMetaPage) BufferGetPage(metabuf);
......@@ -558,11 +570,15 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
*
* Caller must hold exclusive lock on the target bucket. This allows
* us to safely lock multiple pages in the bucket.
*
* Since this function is invoked in VACUUM, we provide an access strategy
* parameter that controls fetches of the bucket pages.
*/
void
_hash_squeezebucket(Relation rel,
Bucket bucket,
BlockNumber bucket_blkno)
BlockNumber bucket_blkno,
BufferAccessStrategy bstrategy)
{
Buffer wbuf;
Buffer rbuf = 0;
......@@ -581,7 +597,11 @@ _hash_squeezebucket(Relation rel,
* start squeezing into the base bucket page.
*/
wblkno = bucket_blkno;
wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE);
wbuf = _hash_getbuf_with_strategy(rel,
wblkno,
HASH_WRITE,
LH_BUCKET_PAGE,
bstrategy);
wpage = BufferGetPage(wbuf);
wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
......@@ -595,8 +615,10 @@ _hash_squeezebucket(Relation rel,
}
/*
* find the last page in the bucket chain by starting at the base bucket
* page and working forward.
* Find the last page in the bucket chain by starting at the base bucket
* page and working forward. Note: we assume that a hash bucket chain is
* usually smaller than the buffer ring being used by VACUUM, else using
* the access strategy here would be counterproductive.
*/
ropaque = wopaque;
do
......@@ -604,7 +626,11 @@ _hash_squeezebucket(Relation rel,
rblkno = ropaque->hasho_nextblkno;
if (ropaque != wopaque)
_hash_relbuf(rel, rbuf);
rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
rbuf = _hash_getbuf_with_strategy(rel,
rblkno,
HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
rpage = BufferGetPage(rbuf);
ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
Assert(ropaque->hasho_bucket == bucket);
......@@ -644,7 +670,11 @@ _hash_squeezebucket(Relation rel,
return;
}
wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
wbuf = _hash_getbuf_with_strategy(rel,
wblkno,
HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
wpage = BufferGetPage(wbuf);
wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
Assert(wopaque->hasho_bucket == bucket);
......@@ -688,15 +718,19 @@ _hash_squeezebucket(Relation rel,
/* yes, so release wbuf lock first */
_hash_wrtbuf(rel, wbuf);
/* free this overflow page (releases rbuf) */
_hash_freeovflpage(rel, rbuf);
_hash_freeovflpage(rel, rbuf, bstrategy);
/* done */
return;
}
/* free this overflow page, then get the previous one */
_hash_freeovflpage(rel, rbuf);
_hash_freeovflpage(rel, rbuf, bstrategy);
rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
rbuf = _hash_getbuf_with_strategy(rel,
rblkno,
HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
rpage = BufferGetPage(rbuf);
ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
Assert(ropaque->hasho_bucket == bucket);
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.67 2007/05/03 16:45:58 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.68 2007/05/30 20:11:51 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
......@@ -214,6 +214,34 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno)
return buf;
}
/*
* _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
*
* This is identical to _hash_getbuf() but also allows a buffer access
* strategy to be specified. We use this for VACUUM operations.
*/
Buffer
_hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
int access, int flags,
BufferAccessStrategy bstrategy)
{
Buffer buf;
if (blkno == P_NEW)
elog(ERROR, "hash AM does not use P_NEW");
buf = ReadBufferWithStrategy(rel, blkno, bstrategy);
if (access != HASH_NOLOCK)
LockBuffer(buf, access);
/* ref count and lock type are correct */
_hash_checkpage(rel, buf, flags);
return buf;
}
/*
* _hash_relbuf() -- release a locked buffer.
*
......@@ -840,5 +868,5 @@ _hash_splitbucket(Relation rel,
_hash_wrtbuf(rel, obuf);
_hash_wrtbuf(rel, nbuf);
_hash_squeezebucket(rel, obucket, start_oblkno);
_hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}
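(Passing NULL here selects the default access strategy: _hash_squeezebucket is
being called from a bucket split in a foreground backend, not from VACUUM, so
no buffer ring is wanted.)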
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.233 2007/05/27 03:50:38 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.234 2007/05/30 20:11:53 tgl Exp $
*
*
* INTERFACE ROUTINES
......@@ -83,6 +83,24 @@ initscan(HeapScanDesc scan, ScanKey key)
*/
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
/*
* If the table is large relative to NBuffers, use a bulk-read access
* strategy, else use the default random-access strategy. During a
* rescan, don't make a new strategy object if we don't have to.
*/
if (scan->rs_nblocks > NBuffers / 4 &&
!scan->rs_rd->rd_istemp)
{
if (scan->rs_strategy == NULL)
scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
}
else
{
if (scan->rs_strategy != NULL)
FreeAccessStrategy(scan->rs_strategy);
scan->rs_strategy = NULL;
}
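(As a concrete example of the threshold, assuming the default 8KB block size:
with shared_buffers = 1GB, NBuffers is 131072, so only tables larger than
131072/4 = 32768 blocks, i.e. 256MB, adopt the bulk-read strategy.)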
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
......@@ -123,9 +141,17 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
Assert(page < scan->rs_nblocks);
scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
scan->rs_rd,
page);
/* release previous scan buffer, if any */
if (BufferIsValid(scan->rs_cbuf))
{
ReleaseBuffer(scan->rs_cbuf);
scan->rs_cbuf = InvalidBuffer;
}
/* read page using selected strategy */
scan->rs_cbuf = ReadBufferWithStrategy(scan->rs_rd,
page,
scan->rs_strategy);
scan->rs_cblock = page;
if (!scan->rs_pageatatime)
......@@ -938,6 +964,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_rd = relation;
scan->rs_snapshot = snapshot;
scan->rs_nkeys = nkeys;
scan->rs_strategy = NULL; /* set in initscan */
/*
* we can use page-at-a-time mode if it's an MVCC-safe snapshot
......@@ -1007,6 +1034,9 @@ heap_endscan(HeapScanDesc scan)
if (scan->rs_key)
pfree(scan->rs_key);
if (scan->rs_strategy != NULL)
FreeAccessStrategy(scan->rs_strategy);
pfree(scan);
}
......
......@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.154 2007/01/05 22:19:23 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.155 2007/05/30 20:11:53 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -786,9 +786,10 @@ restart:
/*
* We can't use _bt_getbuf() here because it always applies
* _bt_checkpage(), which will barf on an all-zero page. We want to
* recycle all-zero pages, not fail.
* recycle all-zero pages, not fail. Also, we want to use a nondefault
* buffer access strategy.
*/
buf = ReadBuffer(rel, blkno);
buf = ReadBufferWithStrategy(rel, blkno, info->strategy);
LockBuffer(buf, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.269 2007/05/20 21:08:19 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.270 2007/05/30 20:11:55 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1799,6 +1799,36 @@ XLogFlush(XLogRecPtr record)
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}
/*
* Test whether XLOG data has been flushed up to (at least) the given position.
*
* Returns true if a flush is still needed. (It may be that someone else
* is already in process of flushing that far, however.)
*/
bool
XLogNeedsFlush(XLogRecPtr record)
{
/* Quick exit if already known flushed */
if (XLByteLE(record, LogwrtResult.Flush))
return false;
/* read LogwrtResult and update local state */
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
LogwrtResult = xlogctl->LogwrtResult;
SpinLockRelease(&xlogctl->info_lck);
}
/* check again */
if (XLByteLE(record, LogwrtResult.Flush))
return false;
return true;
}
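XLogNeedsFlush is what lets the buffer-ring code decide cheaply whether
reusing a dirty buffer would force a WAL flush.  A simplified sketch of that
decision at buffer-replacement time (the real caller lives in the collapsed
bufmgr.c/freelist.c hunks; StrategyRejectBuffer is declared in buf_internals.h
below, and BufferGetLSN stands for reading the page's LSN):

    /* Dirty buffer whose reuse would force a WAL flush: a BAS_BULKREAD
     * ring rejects it and falls back to the normal clock sweep, while
     * BAS_VACUUM keeps it and pays for the flush. */
    if (strategy != NULL &&
        XLogNeedsFlush(BufferGetLSN(buf)) &&
        StrategyRejectBuffer(strategy, buf))
        continue;               /* buffer dropped from ring; pick another */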
/*
* Create a new XLOG file segment, or open a pre-existing one.
*
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.283 2007/05/16 17:28:20 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.284 2007/05/30 20:11:55 tgl Exp $
*
*
* INTERFACE ROUTINES
......@@ -1658,6 +1658,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
ivinfo.vacuum_full = false;
ivinfo.message_level = DEBUG2;
ivinfo.num_heap_tuples = -1;
ivinfo.strategy = NULL;
state.tuplesort = tuplesort_begin_datum(TIDOID,
TIDLessOperator, false,
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.107 2007/04/30 03:23:48 tgl Exp $
* $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.108 2007/05/30 20:11:56 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -63,10 +63,13 @@ typedef struct AnlIndexData
/* Default statistics target (GUC parameter) */
int default_statistics_target = 10;
/* A few variables that don't seem worth passing around as parameters */
static int elevel = -1;
static MemoryContext anl_context = NULL;
static BufferAccessStrategy vac_strategy;
static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
int samplesize);
......@@ -94,7 +97,8 @@ static bool std_typanalyze(VacAttrStats *stats);
* analyze_rel() -- analyze one relation
*/
void
analyze_rel(Oid relid, VacuumStmt *vacstmt)
analyze_rel(Oid relid, VacuumStmt *vacstmt,
BufferAccessStrategy bstrategy)
{
Relation onerel;
int attr_cnt,
......@@ -120,6 +124,8 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt)
else
elevel = DEBUG2;
vac_strategy = bstrategy;
/*
* Use the current context for storing analysis info. vacuum.c ensures
* that this context will be cleared when I return, thus releasing the
......@@ -845,7 +851,7 @@ acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows,
* looking at it. We don't maintain a lock on the page, so tuples
* could get added to it, but we ignore such tuples.
*/
targbuffer = ReadBuffer(onerel, targblock);
targbuffer = ReadBufferWithStrategy(onerel, targblock, vac_strategy);
LockBuffer(targbuffer, BUFFER_LOCK_SHARE);
targpage = BufferGetPage(targbuffer);
maxoffset = PageGetMaxOffsetNumber(targpage);
......
......@@ -13,7 +13,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.351 2007/05/17 15:28:29 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.352 2007/05/30 20:11:57 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -191,6 +191,7 @@ ExecContext_Finish(ExecContext ec)
*----------------------------------------------------------------------
*/
/* A few variables that don't seem worth passing around as parameters */
static MemoryContext vac_context = NULL;
static int elevel = -1;
......@@ -198,6 +199,8 @@ static int elevel = -1;
static TransactionId OldestXmin;
static TransactionId FreezeLimit;
static BufferAccessStrategy vac_strategy;
/* non-export function prototypes */
static List *get_rel_oids(List *relids, const RangeVar *vacrel,
......@@ -257,14 +260,18 @@ static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page);
* relation OIDs to be processed, and vacstmt->relation is ignored.
* (The non-NIL case is currently only used by autovacuum.)
*
* bstrategy is normally given as NULL, but in autovacuum it can be passed
* in to use the same buffer strategy object across multiple vacuum() calls.
*
* isTopLevel should be passed down from ProcessUtility.
*
* It is the caller's responsibility that both vacstmt and relids
* It is the caller's responsibility that vacstmt, relids, and bstrategy
* (if given) be allocated in a memory context that won't disappear
* at transaction commit.
*/
void
vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel)
vacuum(VacuumStmt *vacstmt, List *relids,
BufferAccessStrategy bstrategy, bool isTopLevel)
{
const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
volatile MemoryContext anl_context = NULL;
......@@ -319,6 +326,19 @@ vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel)
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
/*
* If caller didn't give us a buffer strategy object, make one in the
* cross-transaction memory context.
*/
if (bstrategy == NULL)
{
MemoryContext old_context = MemoryContextSwitchTo(vac_context);
bstrategy = GetAccessStrategy(BAS_VACUUM);
MemoryContextSwitchTo(old_context);
}
vac_strategy = bstrategy;
/* Remember whether we are processing everything in the DB */
all_rels = (relids == NIL && vacstmt->relation == NULL);
......@@ -417,15 +437,7 @@ vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel)
else
old_context = MemoryContextSwitchTo(anl_context);
/*
* Tell the buffer replacement strategy that vacuum is causing
* the IO
*/
StrategyHintVacuum(true);
analyze_rel(relid, vacstmt);
StrategyHintVacuum(false);
analyze_rel(relid, vacstmt, vac_strategy);
if (use_own_xacts)
CommitTransactionCommand();
......@@ -441,8 +453,6 @@ vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel)
{
/* Make sure cost accounting is turned off after error */
VacuumCostActive = false;
/* And reset buffer replacement strategy, too */
StrategyHintVacuum(false);
PG_RE_THROW();
}
PG_END_TRY();
......@@ -1084,21 +1094,13 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
*/
toast_relid = onerel->rd_rel->reltoastrelid;
/*
* Tell the cache replacement strategy that vacuum is causing all
* following IO
*/
StrategyHintVacuum(true);
/*
* Do the actual work --- either FULL or "lazy" vacuum
*/
if (vacstmt->full)
full_vacuum_rel(onerel, vacstmt);
else
lazy_vacuum_rel(onerel, vacstmt);
StrategyHintVacuum(false);
lazy_vacuum_rel(onerel, vacstmt, vac_strategy);
/* all done with this class, but hold lock until commit */
relation_close(onerel, NoLock);
......@@ -1290,7 +1292,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
vacuum_delay_point();
buf = ReadBuffer(onerel, blkno);
buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
page = BufferGetPage(buf);
/*
......@@ -1730,7 +1732,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
/*
* Process this page of relation.
*/
buf = ReadBuffer(onerel, blkno);
buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
page = BufferGetPage(buf);
vacpage->offsets_free = 0;
......@@ -1954,8 +1956,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
nextTid = tp.t_data->t_ctid;
priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
/* assume block# is OK (see heap_fetch comments) */
nextBuf = ReadBuffer(onerel,
ItemPointerGetBlockNumber(&nextTid));
nextBuf = ReadBufferWithStrategy(onerel,
ItemPointerGetBlockNumber(&nextTid),
vac_strategy);
nextPage = BufferGetPage(nextBuf);
/* If bogus or unused slot, assume tp is end of chain */
nextOffnum = ItemPointerGetOffsetNumber(&nextTid);
......@@ -2091,8 +2094,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
break; /* out of check-all-items loop */
}
tp.t_self = vtlp->this_tid;
Pbuf = ReadBuffer(onerel,
ItemPointerGetBlockNumber(&(tp.t_self)));
Pbuf = ReadBufferWithStrategy(onerel,
ItemPointerGetBlockNumber(&(tp.t_self)),
vac_strategy);
Ppage = BufferGetPage(Pbuf);
Pitemid = PageGetItemId(Ppage,
ItemPointerGetOffsetNumber(&(tp.t_self)));
......@@ -2174,11 +2178,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
/* Get page to move from */
tuple.t_self = vtmove[ti].tid;
Cbuf = ReadBuffer(onerel,
ItemPointerGetBlockNumber(&(tuple.t_self)));
Cbuf = ReadBufferWithStrategy(onerel,
ItemPointerGetBlockNumber(&(tuple.t_self)),
vac_strategy);
/* Get page to move to */
dst_buffer = ReadBuffer(onerel, destvacpage->blkno);
dst_buffer = ReadBufferWithStrategy(onerel,
destvacpage->blkno,
vac_strategy);
LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
if (dst_buffer != Cbuf)
......@@ -2239,7 +2246,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
if (i == num_fraged_pages)
break; /* can't move item anywhere */
dst_vacpage = fraged_pages->pagedesc[i];
dst_buffer = ReadBuffer(onerel, dst_vacpage->blkno);
dst_buffer = ReadBufferWithStrategy(onerel,
dst_vacpage->blkno,
vac_strategy);
LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
dst_page = BufferGetPage(dst_buffer);
/* if this page was not used before - clean it */
......@@ -2386,7 +2395,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
Page page;
/* this page was not used as a move target, so must clean it */
buf = ReadBuffer(onerel, (*curpage)->blkno);
buf = ReadBufferWithStrategy(onerel,
(*curpage)->blkno,
vac_strategy);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
if (!PageIsEmpty(page))
......@@ -2470,7 +2481,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
int uncnt;
int num_tuples = 0;
buf = ReadBuffer(onerel, vacpage->blkno);
buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
maxoff = PageGetMaxOffsetNumber(page);
......@@ -2859,7 +2870,7 @@ update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages,
break; /* no need to scan any further */
if ((*curpage)->offsets_used == 0)
continue; /* this page was never used as a move dest */
buf = ReadBuffer(rel, (*curpage)->blkno);
buf = ReadBufferWithStrategy(rel, (*curpage)->blkno, vac_strategy);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
max_offset = PageGetMaxOffsetNumber(page);
......@@ -2925,7 +2936,9 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
if ((*vacpage)->offsets_free > 0)
{
buf = ReadBuffer(onerel, (*vacpage)->blkno);
buf = ReadBufferWithStrategy(onerel,
(*vacpage)->blkno,
vac_strategy);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
vacuum_page(onerel, buf, *vacpage);
UnlockReleaseBuffer(buf);
......@@ -3012,6 +3025,7 @@ scan_index(Relation indrel, double num_tuples)
ivinfo.vacuum_full = true;
ivinfo.message_level = elevel;
ivinfo.num_heap_tuples = num_tuples;
ivinfo.strategy = vac_strategy;
stats = index_vacuum_cleanup(&ivinfo, NULL);
......@@ -3077,6 +3091,7 @@ vacuum_index(VacPageList vacpagelist, Relation indrel,
ivinfo.vacuum_full = true;
ivinfo.message_level = elevel;
ivinfo.num_heap_tuples = num_tuples + keep_tuples;
ivinfo.strategy = vac_strategy;
/* Do bulk deletion */
stats = index_bulk_delete(&ivinfo, NULL, tid_reaped, (void *) vacpagelist);
......
......@@ -36,7 +36,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.89 2007/05/17 15:28:29 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.90 2007/05/30 20:11:57 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -96,11 +96,14 @@ typedef struct LVRelStats
} LVRelStats;
/* A few variables that don't seem worth passing around as parameters */
static int elevel = -1;
static TransactionId OldestXmin;
static TransactionId FreezeLimit;
static BufferAccessStrategy vac_strategy;
/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
......@@ -138,7 +141,8 @@ static int vac_cmp_page_spaces(const void *left, const void *right);
* and locked the relation.
*/
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
BufferAccessStrategy bstrategy)
{
LVRelStats *vacrelstats;
Relation *Irel;
......@@ -158,6 +162,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
else
elevel = DEBUG2;
vac_strategy = bstrategy;
vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
&OldestXmin, &FreezeLimit);
......@@ -318,7 +324,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
vacrelstats->num_index_scans++;
}
buf = ReadBuffer(onerel, blkno);
buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
/* Initially, we only need shared access to the buffer */
LockBuffer(buf, BUFFER_LOCK_SHARE);
......@@ -586,7 +592,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
vacuum_delay_point();
tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
buf = ReadBuffer(onerel, tblk);
buf = ReadBufferWithStrategy(onerel, tblk, vac_strategy);
LockBufferForCleanup(buf);
tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
/* Now that we've compacted the page, record its available space */
......@@ -684,6 +690,7 @@ lazy_vacuum_index(Relation indrel,
ivinfo.message_level = elevel;
/* We don't yet know rel_tuples, so pass -1 */
ivinfo.num_heap_tuples = -1;
ivinfo.strategy = vac_strategy;
/* Do bulk deletion */
*stats = index_bulk_delete(&ivinfo, *stats,
......@@ -713,6 +720,7 @@ lazy_cleanup_index(Relation indrel,
ivinfo.vacuum_full = false;
ivinfo.message_level = elevel;
ivinfo.num_heap_tuples = vacrelstats->rel_tuples;
ivinfo.strategy = vac_strategy;
stats = index_vacuum_cleanup(&ivinfo, stats);
......@@ -869,7 +877,7 @@ count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
blkno--;
buf = ReadBuffer(onerel, blkno);
buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
/* In this phase we only need shared access to the buffer */
LockBuffer(buf, BUFFER_LOCK_SHARE);
......
......@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/postmaster/autovacuum.c,v 1.46 2007/05/07 20:41:24 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/postmaster/autovacuum.c,v 1.47 2007/05/30 20:11:57 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -218,7 +218,8 @@ static void relation_needs_vacanalyze(Oid relid, Form_pg_autovacuum avForm,
bool *doanalyze);
static void autovacuum_do_vac_analyze(Oid relid, bool dovacuum,
bool doanalyze, int freeze_min_age);
bool doanalyze, int freeze_min_age,
BufferAccessStrategy bstrategy);
static HeapTuple get_pg_autovacuum_tuple_relid(Relation avRel, Oid relid);
static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
PgStat_StatDBEntry *shared,
......@@ -1673,6 +1674,7 @@ do_autovacuum(void)
ListCell *cell;
PgStat_StatDBEntry *shared;
PgStat_StatDBEntry *dbentry;
BufferAccessStrategy bstrategy;
/*
* may be NULL if we couldn't find an entry (only happens if we
......@@ -1812,6 +1814,13 @@ do_autovacuum(void)
list_free(toast_oids);
toast_oids = NIL;
/*
* Create a buffer access strategy object for VACUUM to use. We want
* to use the same one across all the vacuum operations we perform,
* since the point is for VACUUM not to blow out the shared cache.
*/
bstrategy = GetAccessStrategy(BAS_VACUUM);
/*
* Perform operations on collected tables.
*/
......@@ -1910,7 +1919,8 @@ next_worker:
autovacuum_do_vac_analyze(tab->at_relid,
tab->at_dovacuum,
tab->at_doanalyze,
tab->at_freeze_min_age);
tab->at_freeze_min_age,
bstrategy);
/* be tidy */
pfree(tab);
}
......@@ -2328,7 +2338,8 @@ relation_needs_vacanalyze(Oid relid,
*/
static void
autovacuum_do_vac_analyze(Oid relid, bool dovacuum, bool doanalyze,
int freeze_min_age)
int freeze_min_age,
BufferAccessStrategy bstrategy)
{
VacuumStmt vacstmt;
MemoryContext old_cxt;
......@@ -2354,7 +2365,7 @@ autovacuum_do_vac_analyze(Oid relid, bool dovacuum, bool doanalyze,
/* Let pgstat know what we're doing */
autovac_report_activity(&vacstmt, relid);
vacuum(&vacstmt, list_make1_oid(relid), true);
vacuum(&vacstmt, list_make1_oid(relid), bstrategy, true);
MemoryContextSwitchTo(old_cxt);
}
......
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.11 2006/07/23 03:07:58 tgl Exp $
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.12 2007/05/30 20:11:58 tgl Exp $
Notes about shared buffer access rules
--------------------------------------
......@@ -152,20 +152,21 @@
we could use per-backend LWLocks instead (a buffer header would then contain
a field to show which backend is doing its I/O).
Buffer replacement strategy
---------------------------
Normal buffer replacement strategy
----------------------------------
There is a "free list" of buffers that are prime candidates for replacement.
In particular, buffers that are completely free (contain no valid page) are
always in this list. We may also throw buffers into this list if we
consider their pages unlikely to be needed soon. The list is singly-linked
using fields in the buffer headers; we maintain head and tail pointers in
global variables. (Note: although the list links are in the buffer headers,
they are considered to be protected by the BufFreelistLock, not the
buffer-header spinlocks.) To choose a victim buffer to recycle when there
are no free buffers available, we use a simple clock-sweep algorithm, which
avoids the need to take system-wide locks during common operations. It
works like this:
always in this list. We could also throw buffers into this list if we
consider their pages unlikely to be needed soon; however, the current
algorithm never does that. The list is singly-linked using fields in the
buffer headers; we maintain head and tail pointers in global variables.
(Note: although the list links are in the buffer headers, they are
considered to be protected by the BufFreelistLock, not the buffer-header
spinlocks.) To choose a victim buffer to recycle when there are no free
buffers available, we use a simple clock-sweep algorithm, which avoids the
need to take system-wide locks during common operations. It works like
this:
Each buffer header contains a usage counter, which is incremented (up to a
small limit value) whenever the buffer is unpinned. (This requires only the
......@@ -199,22 +200,40 @@
before we can recycle it; if someone else pins the buffer meanwhile we will
have to give up and try another buffer. This however is not a concern
of the basic select-a-victim-buffer algorithm.)
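A compact pseudocode sketch of the sweep (the real implementation is
StrategyGetBuffer in freelist.c, collapsed in this view; "victim" here is
illustrative for its sweep pointer):

    for (;;)
    {
        buf = &BufferDescriptors[victim];
        victim = (victim + 1) % NBuffers;

        if (buf->refcount == 0)
        {
            if (buf->usage_count > 0)
                buf->usage_count--;     /* recently used: spare it this pass */
            else
                return buf;             /* cold and unpinned: evict */
        }
    }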
A special provision is that while running VACUUM, a backend does not
increment the usage count on buffers it accesses. In fact, if ReleaseBuffer
sees that it is dropping the pin count to zero and the usage count is zero,
then it appends the buffer to the tail of the free list. (This implies that
VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer;
this shouldn't create much of a contention problem.) This provision
encourages VACUUM to work in a relatively small number of buffers rather
than blowing out the entire buffer cache. It is reasonable since a page
that has been touched only by VACUUM is unlikely to be needed again soon.
Since VACUUM usually requests many pages very fast, the effect of this is that
it will get back the very buffers it filled and possibly modified on the next
call and will therefore do its work in a few shared memory buffers, while
being able to use whatever it finds in the cache already. This also implies
that most of the write traffic caused by a VACUUM will be done by the VACUUM
itself and not pushed off onto other processes.
Buffer ring replacement strategy
---------------------------------
When running a query that needs to access a large number of pages just once,
such as VACUUM or a large sequential scan, a different strategy is used.
A page that has been touched only by such a scan is unlikely to be needed
again soon, so instead of running the normal clock sweep algorithm and
blowing out the entire buffer cache, a small ring of buffers is allocated
using the normal clock sweep algorithm and those buffers are reused for the
whole scan. This also implies that much of the write traffic caused by such
a statement will be done by the backend itself and not pushed off onto other
processes.
For sequential scans, a 256KB ring is used. That's small enough to fit in L2
cache, which makes transferring pages from OS cache to shared buffer cache
efficient. Even less would often be enough, but the ring must be big enough
to accommodate all pages in the scan that are pinned concurrently. 256KB
should also be enough to leave a small cache trail for other backends to
join in a synchronized seq scan. If a ring buffer is dirtied and its LSN
updated, we would normally have to write and flush WAL before we could
re-use the buffer; in this case we instead discard the buffer from the ring
and (later) choose a replacement using the normal clock-sweep algorithm.
Hence this strategy works best for scans that are read-only (or at worst
update hint bits). In a scan that modifies every page in the scan, like a
bulk UPDATE or DELETE, the buffers in the ring will always be dirtied and
the ring strategy effectively degrades to the normal strategy.
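(With the default 8KB block size, a 256KB ring is 32 buffers.)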
VACUUM uses a 256KB ring like sequential scans, but dirty pages are not
removed from the ring. Instead, WAL is flushed if needed to allow reuse of
the buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's
buffers were sent to the freelist, which was effectively a buffer ring of 1
buffer, resulting in excessive WAL flushing. Allowing VACUUM to update
256KB between WAL flushes should be more efficient.
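One plausible shape of the ring object itself, sketched from the behavior
described above (the real BufferAccessStrategyData is private to freelist.c,
whose diff is collapsed in this view, so the field names here are
illustrative):

    typedef struct BufferAccessStrategyData
    {
        BufferAccessStrategyType btype; /* BAS_BULKREAD or BAS_VACUUM */
        int         ring_size;          /* e.g. 32 slots for a 256KB ring */
        int         current;            /* most recently used slot */
        Buffer      buffers[1];         /* variable-size array of buffer ids */
    } BufferAccessStrategyData;

On each request the next slot's buffer is reused if it is unpinned and not
recently touched by anyone else; otherwise the normal clock sweep supplies a
buffer, which is then remembered in that slot.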
Background writer's processing
......
(Two diffs are collapsed in this view: the bufmgr.c and freelist.c changes
that implement ReadBufferWithStrategy and the strategy/ring machinery.)
......@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.76 2007/01/05 22:19:37 momjian Exp $
* $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.77 2007/05/30 20:11:59 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -57,7 +57,8 @@ static Block GetLocalBufferStorage(void);
*
* API is similar to bufmgr.c's BufferAlloc, except that we do not need
* to do any locking since this is all local. Also, IO_IN_PROGRESS
* does not get set.
* does not get set. Lastly, we support only default access strategy
* (hence, usage_count is always advanced).
*/
BufferDesc *
LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
......@@ -88,7 +89,12 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
fprintf(stderr, "LB ALLOC (%u,%d) %d\n",
RelationGetRelid(reln), blockNum, -b - 1);
#endif
/* this part is equivalent to PinBuffer for a shared buffer */
if (LocalRefCount[b] == 0)
{
if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
bufHdr->usage_count++;
}
LocalRefCount[b]++;
ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(bufHdr));
......@@ -121,18 +127,21 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
bufHdr = &LocalBufferDescriptors[b];
if (LocalRefCount[b] == 0 && bufHdr->usage_count == 0)
{
LocalRefCount[b]++;
ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(bufHdr));
break;
}
if (bufHdr->usage_count > 0)
if (LocalRefCount[b] == 0)
{
bufHdr->usage_count--;
trycounter = NLocBuffer;
if (bufHdr->usage_count > 0)
{
bufHdr->usage_count--;
trycounter = NLocBuffer;
}
else
{
/* Found a usable buffer */
LocalRefCount[b]++;
ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(bufHdr));
break;
}
}
else if (--trycounter == 0)
ereport(ERROR,
......@@ -199,7 +208,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
bufHdr->tag = newTag;
bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
bufHdr->flags |= BM_TAG_VALID;
bufHdr->usage_count = 0;
bufHdr->usage_count = 1;
*foundPtr = FALSE;
return bufHdr;
......
......@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tcop/utility.c,v 1.279 2007/04/27 22:05:49 tgl Exp $
* $PostgreSQL: pgsql/src/backend/tcop/utility.c,v 1.280 2007/05/30 20:12:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -931,7 +931,7 @@ ProcessUtility(Node *parsetree,
break;
case T_VacuumStmt:
vacuum((VacuumStmt *) parsetree, NIL, isTopLevel);
vacuum((VacuumStmt *) parsetree, NIL, NULL, isTopLevel);
break;
case T_ExplainStmt:
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/genam.h,v 1.66 2007/01/05 22:19:50 momjian Exp $
* $PostgreSQL: pgsql/src/include/access/genam.h,v 1.67 2007/05/30 20:12:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -40,6 +40,7 @@ typedef struct IndexVacuumInfo
bool vacuum_full; /* VACUUM FULL (we have exclusive lock) */
int message_level; /* ereport level for progress messages */
double num_heap_tuples; /* tuples remaining in heap */
BufferAccessStrategy strategy; /* access strategy for reads */
} IndexVacuumInfo;
/*
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/hash.h,v 1.80 2007/05/03 16:45:58 tgl Exp $
* $PostgreSQL: pgsql/src/include/access/hash.h,v 1.81 2007/05/30 20:12:02 tgl Exp $
*
* NOTES
* modeled after Margo Seltzer's hash implementation for unix.
......@@ -273,11 +273,13 @@ extern void _hash_doinsert(Relation rel, IndexTuple itup);
/* hashovfl.c */
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf,
BufferAccessStrategy bstrategy);
extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
BlockNumber blkno);
extern void _hash_squeezebucket(Relation rel,
Bucket bucket, BlockNumber bucket_blkno);
Bucket bucket, BlockNumber bucket_blkno,
BufferAccessStrategy bstrategy);
/* hashpage.c */
extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
......@@ -287,6 +289,9 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
int access, int flags);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno);
extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
int access, int flags,
BufferAccessStrategy bstrategy);
extern void _hash_relbuf(Relation rel, Buffer buf);
extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_wrtbuf(Relation rel, Buffer buf);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.53 2007/05/27 03:50:39 tgl Exp $
* $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.54 2007/05/30 20:12:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -27,6 +27,7 @@ typedef struct HeapScanDescData
int rs_nkeys; /* number of scan keys */
ScanKey rs_key; /* array of scan key descriptors */
BlockNumber rs_nblocks; /* number of blocks to scan */
BufferAccessStrategy rs_strategy; /* access strategy for reads */
bool rs_pageatatime; /* verify visibility page-at-a-time? */
/* scan current state */
......
......@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.77 2007/05/20 21:08:19 tgl Exp $
* $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.78 2007/05/30 20:12:02 tgl Exp $
*/
#ifndef XLOG_H
#define XLOG_H
......@@ -159,6 +159,7 @@ extern bool XLOG_DEBUG;
extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
extern void XLogFlush(XLogRecPtr RecPtr);
extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/commands/vacuum.h,v 1.71 2007/05/17 15:28:29 alvherre Exp $
* $PostgreSQL: pgsql/src/include/commands/vacuum.h,v 1.72 2007/05/30 20:12:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -18,9 +18,11 @@
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "nodes/parsenodes.h"
#include "storage/buf.h"
#include "storage/lock.h"
#include "utils/rel.h"
/*----------
* ANALYZE builds one of these structs for each attribute (column) that is
* to be analyzed. The struct and subsidiary data are in anl_context,
......@@ -110,7 +112,8 @@ extern int vacuum_freeze_min_age;
/* in commands/vacuum.c */
extern void vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel);
extern void vacuum(VacuumStmt *vacstmt, List *relids,
BufferAccessStrategy bstrategy, bool isTopLevel);
extern void vac_open_indexes(Relation relation, LOCKMODE lockmode,
int *nindexes, Relation **Irel);
extern void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode);
......@@ -127,9 +130,11 @@ extern bool vac_is_partial_index(Relation indrel);
extern void vacuum_delay_point(void);
/* in commands/vacuumlazy.c */
extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
BufferAccessStrategy bstrategy);
/* in commands/analyze.c */
extern void analyze_rel(Oid relid, VacuumStmt *vacstmt);
extern void analyze_rel(Oid relid, VacuumStmt *vacstmt,
BufferAccessStrategy bstrategy);
#endif /* VACUUM_H */
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/buf.h,v 1.21 2007/01/05 22:19:57 momjian Exp $
* $PostgreSQL: pgsql/src/include/storage/buf.h,v 1.22 2007/05/30 20:12:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -36,4 +36,11 @@ typedef int Buffer;
*/
#define BufferIsLocal(buffer) ((buffer) < 0)
/*
* Buffer access strategy objects.
*
* BufferAccessStrategyData is private to freelist.c
*/
typedef struct BufferAccessStrategyData *BufferAccessStrategy;
#endif /* BUF_H */
......@@ -8,7 +8,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.89 2007/01/05 22:19:57 momjian Exp $
* $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.90 2007/05/30 20:12:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -167,9 +167,6 @@ extern DLLIMPORT BufferDesc *BufferDescriptors;
/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;
/* in freelist.c */
extern bool strategy_hint_vacuum;
/* event counters in buf_init.c */
extern long int ReadBufferCount;
extern long int ReadLocalBufferCount;
......@@ -184,8 +181,12 @@ extern long int LocalBufferFlushCount;
*/
/* freelist.c */
extern volatile BufferDesc *StrategyGetBuffer(void);
extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
bool *lock_held);
extern void StrategyFreeBuffer(volatile BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
volatile BufferDesc *buf);
extern int StrategySyncStart(void);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.103 2007/05/02 23:18:03 tgl Exp $
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.104 2007/05/30 20:12:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -19,6 +19,14 @@
typedef void *Block;
/* Possible arguments for GetAccessStrategy() */
typedef enum BufferAccessStrategyType
{
BAS_NORMAL, /* Normal random access */
BAS_BULKREAD, /* Large read-only scan (hint bit updates are ok) */
BAS_VACUUM /* VACUUM */
} BufferAccessStrategyType;
/* in globals.c ... this duplicates miscadmin.h */
extern DLLIMPORT int NBuffers;
......@@ -111,6 +119,8 @@ extern DLLIMPORT int32 *LocalRefCount;
* prototypes for functions in bufmgr.c
*/
extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
extern Buffer ReadBufferWithStrategy(Relation reln, BlockNumber blockNum,
BufferAccessStrategy strategy);
extern Buffer ReadOrZeroBuffer(Relation reln, BlockNumber blockNum);
extern void ReleaseBuffer(Buffer buffer);
extern void UnlockReleaseBuffer(Buffer buffer);
......@@ -157,6 +167,7 @@ extern void BgBufferSync(void);
extern void AtProcExit_LocalBuffers(void);
/* in freelist.c */
extern void StrategyHintVacuum(bool vacuum_active);
extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
extern void FreeAccessStrategy(BufferAccessStrategy strategy);
#endif