Commit 5d508736 authored by Tom Lane

Replace the BufMgrLock with separate locks on the lookup hashtable and
the freelist, plus per-buffer spinlocks that protect access to individual
shared buffer headers.  This requires abandoning a global freelist (since
the freelist is a global contention point), which shoots down ARC and 2Q
as well as plain LRU management.  Adopt a clock sweep algorithm instead.
Preliminary results show substantial improvement in multi-backend situations.
parent 5592a6cf
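For orientation, the shared buffer header that this commit ends up with can be pictured roughly as follows. This is a simplified sketch assembled from the field initializations visible in the buf_init.c hunk further down, not the actual declaration in buf_internals.h; the field order, the exact types, and any omitted members are guesses.

#include "postgres.h"
#include "storage/buf_internals.h"   /* BufferTag, BufFlags */
#include "storage/backendid.h"       /* BackendId */
#include "storage/lwlock.h"          /* LWLockId */
#include "storage/spin.h"            /* slock_t */

/* Simplified view of a shared buffer header after this commit (sketch only) */
typedef struct sbufdesc_sketch
{
    BufferTag   tag;                 /* which disk page, if any */
    BufFlags    flags;               /* BM_DIRTY, BM_VALID, BM_IO_IN_PROGRESS, ... */
    uint16      usage_count;         /* clock-sweep usage counter (type assumed) */
    unsigned    refcount;            /* number of backend pins */
    BackendId   wait_backend_id;     /* backend waiting to get pin count down to 1 */

    slock_t     buf_hdr_lock;        /* spinlock protecting the fields above */

    int         buf_id;              /* this buffer's index */
    int         freeNext;            /* link in freelist.c's free list */

    LWLockId    io_in_progress_lock; /* wait here for I/O on this buffer to finish */
    LWLockId    content_lock;        /* right to access the page contents */
} sbufdesc_sketch;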
<!--
$PostgreSQL: pgsql/doc/src/sgml/runtime.sgml,v 1.307 2005/03/04 20:21:05 tgl Exp $
-->
<chapter Id="runtime">
@@ -1379,9 +1379,7 @@ SET ENABLE_SEQSCAN TO OFF;
Specifies the delay between activity rounds for the
background writer.  In each round the writer issues writes
for some number of dirty buffers (controllable by the
following parameters).  It then sleeps for <varname>bgwriter_delay</>
milliseconds, and repeats.  The default value is 200.  Note
that on many systems, the effective resolution of sleep
delays is 10 milliseconds; setting <varname>bgwriter_delay</>
@@ -1393,32 +1391,77 @@ SET ENABLE_SEQSCAN TO OFF;
</listitem>
</varlistentry>
<varlistentry id="guc-bgwriter-lru-percent" xreflabel="bgwriter_lru_percent">
<term><varname>bgwriter_lru_percent</varname> (<type>floating point</type>)</term>
<indexterm>
<primary><varname>bgwriter_lru_percent</> configuration parameter</primary>
</indexterm>
<listitem>
<para>
To reduce the probability that server processes will need to issue
their own writes, the background writer tries to write buffers that
are likely to be recycled soon.  In each round, it examines up to
<varname>bgwriter_lru_percent</> of the buffers that are nearest to
being recycled, and writes any that are dirty.
The default value is 1.0 (this is a percentage of the total number
of shared buffers).
This option can only be set at server start or in the
<filename>postgresql.conf</filename> file.
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-bgwriter-lru-maxpages" xreflabel="bgwriter_lru_maxpages">
<term><varname>bgwriter_lru_maxpages</varname> (<type>integer</type>)</term>
<indexterm>
<primary><varname>bgwriter_lru_maxpages</> configuration parameter</primary>
</indexterm>
<listitem>
<para>
In each round, no more than this many buffers will be written
as a result of scanning soon-to-be-recycled buffers.
The default value is 5.
This option can only be set at server start or in the
<filename>postgresql.conf</filename> file.
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-bgwriter-all-percent" xreflabel="bgwriter_all_percent">
<term><varname>bgwriter_all_percent</varname> (<type>floating point</type>)</term>
<indexterm>
<primary><varname>bgwriter_all_percent</> configuration parameter</primary>
</indexterm>
<listitem>
<para>
To reduce the amount of work that will be needed at checkpoint time,
the background writer also does a circular scan through the entire
buffer pool, writing buffers that are found to be dirty.
In each round, it examines up to
<varname>bgwriter_all_percent</> of the buffers for this purpose.
The default value is 0.333 (this is a percentage of the total number
of shared buffers). With the default <varname>bgwriter_delay</>
setting, this will allow the entire shared buffer pool to be scanned
about once per minute.
This option can only be set at server start or in the
<filename>postgresql.conf</filename> file.
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-bgwriter-all-maxpages" xreflabel="bgwriter_all_maxpages">
<term><varname>bgwriter_all_maxpages</varname> (<type>integer</type>)</term>
<indexterm>
<primary><varname>bgwriter_all_maxpages</> configuration parameter</primary>
</indexterm>
<listitem>
<para>
In each round, no more than this many buffers will be written
as a result of the scan of the entire buffer pool.  (If this
limit is reached, the scan stops, and resumes at the next buffer
during the next round.)
The default value is 5.
This option can only be set at server start or in the
<filename>postgresql.conf</filename> file.
</para>
</listitem>
@@ -1426,13 +1469,19 @@ SET ENABLE_SEQSCAN TO OFF;
</variablelist>
<para>
Smaller values of <varname>bgwriter_all_percent</varname> and
<varname>bgwriter_all_maxpages</varname> reduce the extra I/O load
caused by the background writer, but leave more work to be done
at checkpoint time.  To reduce load spikes at checkpoints,
increase these two values.
Similarly, smaller values of <varname>bgwriter_lru_percent</varname> and
<varname>bgwriter_lru_maxpages</varname> reduce the extra I/O load
caused by the background writer, but make it more likely that server
processes will have to issue writes for themselves, delaying interactive
queries.
To disable background writing entirely,
set both <varname>maxpages</varname> values and/or both
<varname>percent</varname> values to zero.
</para>
</sect3>
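For concreteness, the five background-writer settings discussed above could appear together in postgresql.conf as shown below. This is only an illustrative excerpt using the default values quoted in this section; the comments are informal summaries, not documentation text.

bgwriter_delay = 200            # milliseconds between activity rounds
bgwriter_lru_percent = 1.0      # % of soon-to-be-recycled buffers examined per round
bgwriter_lru_maxpages = 5       # max buffers written per round by that scan
bgwriter_all_percent = 0.333    # % of the whole buffer pool examined per round
bgwriter_all_maxpages = 5       # max buffers written per round by the all-buffers scan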
@@ -3866,20 +3915,6 @@ plruby.bar = true # generates error, unknown class name
</listitem>
</varlistentry>
<varlistentry id="guc-debug-shared-buffers" xreflabel="debug_shared_buffers">
<term><varname>debug_shared_buffers</varname> (<type>integer</type>)</term>
<indexterm>
<primary><varname>debug_shared_buffers</> configuration parameter</primary>
</indexterm>
<listitem>
<para>
Number of seconds between ARC reports.
If set greater than zero, emit ARC statistics to the log every so many
seconds. Zero (the default) disables reporting.
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-pre-auth-delay" xreflabel="pre_auth_delay">
<term><varname>pre_auth_delay</varname> (<type>integer</type>)</term>
<indexterm>
......
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.245 2005/03/04 20:21:05 tgl Exp $
*
*
* INTERFACE ROUTINES
@@ -1060,7 +1060,6 @@ setRelhasindex(Oid relid, bool hasindex, bool isprimary, Oid reltoastidxid)
/* Send out shared cache inval if necessary */
if (!IsBootstrapProcessingMode())
CacheInvalidateHeapTuple(pg_class, tuple);
BufferSync(-1, -1);
}
else if (dirty)
{
......
@@ -15,7 +15,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.152 2005/03/04 20:21:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -339,7 +339,7 @@ createdb(const CreatedbStmt *stmt)
* up-to-date for the copy.  (We really only need to flush buffers for
* the source database, but bufmgr.c provides no API for that.)
*/
BufferSync();
/*
* Close virtual file descriptors so the kernel has more available for
@@ -1201,7 +1201,7 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
* up-to-date for the copy.  (We really only need to flush buffers for
* the source database, but bufmgr.c provides no API for that.)
*/
BufferSync();
#ifndef WIN32
......
@@ -13,7 +13,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.303 2005/03/04 20:21:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -36,7 +36,6 @@
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "storage/buf_internals.h"
#include "storage/freespace.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
......
@@ -37,7 +37,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.15 2005/03/04 20:21:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -116,9 +116,6 @@ static BgWriterShmemStruct *BgWriterShmem;
* GUC parameters
*/
int BgWriterDelay = 200;
int BgWriterPercent = 1;
int BgWriterMaxPages = 100;
int CheckPointTimeout = 300;
int CheckPointWarning = 30;
@@ -274,7 +271,6 @@ BackgroundWriterMain(void)
bool force_checkpoint = false;
time_t now;
int elapsed_secs;
int n;
long udelay;
/*
@@ -365,16 +361,13 @@ BackgroundWriterMain(void)
* checkpoints happen at a predictable spacing.
*/
last_checkpoint_time = now;
/* Nap for configured time before rechecking */
n = 1;
}
else
BgBufferSync();
/*
* Nap for the configured time, or sleep for 10 seconds if there
* is no bgwriter activity configured.
*
* On some platforms, signals won't interrupt the sleep.  To ensure
* we respond reasonably promptly when someone signals us, break
@@ -383,7 +376,11 @@ BackgroundWriterMain(void)
*
* We absorb pending requests after each short sleep.
*/
if ((bgwriter_all_percent > 0.0 && bgwriter_all_maxpages > 0) ||
(bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0))
udelay = BgWriterDelay * 1000L;
else
udelay = 10000000L;
while (udelay > 1000000L)
{
if (got_SIGHUP || checkpoint_requested || shutdown_requested)
......
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.8 2005/03/04 20:21:06 tgl Exp $

Notes about shared buffer access rules
--------------------------------------

There are two separate access control mechanisms for shared disk buffers:
reference counts (a/k/a pin counts) and buffer content locks.  (Actually,
there's a third level of access control: one must hold the appropriate kind
of lock on a relation before one can legally access any page belonging to
the relation.  Relation-level locks are not discussed here.)

Pins: one must "hold a pin on" a buffer (increment its reference count)
@@ -26,7 +26,7 @@ handled by waiting to obtain the relation-level lock, which is why you'd
better hold one first.)  Pins may not be held across transaction
boundaries, however.

Buffer content locks: there are two kinds of buffer lock, shared and exclusive,
which act just as you'd expect: multiple backends can hold shared locks on
the same buffer, but an exclusive lock prevents anyone else from holding
either shared or exclusive lock.  (These can alternatively be called READ
@@ -38,12 +38,12 @@ the same buffer.  One must pin a buffer before trying to lock it.

Buffer access rules:

1. To scan a page for tuples, one must hold a pin and either shared or
exclusive content lock.  To examine the commit status (XIDs and status bits)
of a tuple in a shared buffer, one must likewise hold a pin and either shared
or exclusive lock.

2. Once one has determined that a tuple is interesting (visible to the
current transaction) one may drop the content lock, yet continue to access
the tuple's data for as long as one holds the buffer pin.  This is what is
typically done by heap scans, since the tuple returned by heap_fetch
contains a pointer to tuple data in the shared buffer.  Therefore the
@@ -52,9 +52,9 @@ change, but that is assumed not to matter after the initial determination
of visibility is made.

3. To add a tuple or change the xmin/xmax fields of an existing tuple,
one must hold a pin and an exclusive content lock on the containing buffer.
This ensures that no one else might see a partially-updated state of the
tuple while they are doing visibility checks.

4. It is considered OK to update tuple commit status bits (ie, OR the
values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or
@@ -76,7 +76,7 @@ no other backend can be holding a reference to an existing tuple that it
might expect to examine again.  Note that another backend might pin the
buffer (increment the refcount) while one is performing the cleanup, but
it won't be able to actually examine the page until it acquires shared
or exclusive content lock.

VACUUM FULL ignores rule #5, because it instead acquires exclusive lock at
@@ -97,149 +97,142 @@ for VACUUM's use, since we don't allow multiple VACUUMs concurrently on a
single relation anyway.
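To make the pin-then-lock discipline described under "Buffer access rules" above concrete, here is a minimal sketch of how a backend typically follows rules 1 and 2 when inspecting a page. It is not taken from the PostgreSQL sources: ReadBuffer, LockBuffer, BufferGetPage, and ReleaseBuffer are the real bufmgr entry points, but the helper name and the tuple-processing details are placeholders.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/* Hypothetical helper: look at one page while honoring the access rules. */
static void
inspect_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf;
    Page        page;

    buf = ReadBuffer(rel, blkno);           /* pin the buffer (pin before lock) */
    LockBuffer(buf, BUFFER_LOCK_SHARE);     /* shared content lock for scanning (rule 1) */

    page = BufferGetPage(buf);
    /* ... examine tuples on 'page' here ... */

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);    /* content lock may be dropped early (rule 2) */
    /* the pin still protects any tuple data we continue to look at */

    ReleaseBuffer(buf);                     /* finally drop the pin */
}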
Buffer manager's internal locking
---------------------------------

Before PostgreSQL 8.1, all operations of the shared buffer manager itself
were protected by a single system-wide lock, the BufMgrLock, which
unsurprisingly proved to be a source of contention.  The new locking scheme
avoids grabbing system-wide exclusive locks in common code paths.  It works
like this:

* There is a system-wide LWLock, the BufMappingLock, that notionally
protects the mapping from buffer tags (page identifiers) to buffers.
(Physically, it can be thought of as protecting the hash table maintained
by buf_table.c.)  To look up whether a buffer exists for a tag, it is
sufficient to obtain share lock on the BufMappingLock.  Note that one
must pin the found buffer, if any, before releasing the BufMappingLock.
To alter the page assignment of any buffer, one must hold exclusive lock
on the BufMappingLock.  This lock must be held across adjusting the buffer's
header fields and changing the buf_table hash table.  The only common
operation that needs exclusive lock is reading in a page that was not
in shared buffers already, which will require at least a kernel call
and usually a wait for I/O, so it will be slow anyway.

* A separate system-wide LWLock, the BufFreelistLock, provides mutual
exclusion for operations that access the buffer free list or select
buffers for replacement.  This is always taken in exclusive mode since
there are no read-only operations on those data structures.  The buffer
management policy is designed so that BufFreelistLock need not be taken
except in paths that will require I/O, and thus will be slow anyway.
(Details appear below.)  It is never necessary to hold the BufMappingLock
and the BufFreelistLock at the same time.

* Each buffer header contains a spinlock that must be taken when examining
or changing fields of that buffer header.  This allows operations such as
ReleaseBuffer to make local state changes without taking any system-wide
lock.  We use a spinlock, not an LWLock, since there are no cases where
the lock needs to be held for more than a few instructions.

Note that a buffer header's spinlock does not control access to the data
held within the buffer.  Each buffer header also contains an LWLock, the
"buffer content lock", that *does* represent the right to access the data
in the buffer.  It is used per the rules above.

There is yet another set of per-buffer LWLocks, the io_in_progress locks,
that are used to wait for I/O on a buffer to complete.  The process doing
a read or write takes exclusive lock for the duration, and processes that
need to wait for completion try to take shared locks (which they release
immediately upon obtaining).  XXX on systems where an LWLock represents
nontrivial resources, it's fairly annoying to need so many locks.  Possibly
we could use per-backend LWLocks instead (a buffer header would then contain
a field to show which backend is doing its I/O).

Buffer replacement strategy
---------------------------

There is a "free list" of buffers that are prime candidates for replacement.
In particular, buffers that are completely free (contain no valid page) are
always in this list.  We may also throw buffers into this list if we
consider their pages unlikely to be needed soon.  The list is singly-linked
using fields in the buffer headers; we maintain head and tail pointers in
global variables.  (Note: although the list links are in the buffer headers,
they are considered to be protected by the BufFreelistLock, not the
buffer-header spinlocks.)  To choose a victim buffer to recycle when there
are no free buffers available, we use a simple clock-sweep algorithm, which
avoids the need to take system-wide locks during common operations.  It
works like this:

Each buffer header contains a usage counter, which is incremented (up to a
small limit value) whenever the buffer is unpinned.  (This requires only the
buffer header spinlock, which would have to be taken anyway to decrement the
buffer reference count, so it's nearly free.)

The "clock hand" is a buffer index, NextVictimBuffer, that moves circularly
through all the available buffers.  NextVictimBuffer is protected by the
BufFreelistLock.

The algorithm for a process that needs to obtain a victim buffer is:

1. Obtain BufFreelistLock.

2. If buffer free list is nonempty, remove its head buffer.  If the buffer
is pinned or has a nonzero usage count, it cannot be used; ignore it and
return to the start of step 2.  Otherwise, pin the buffer, release
BufFreelistLock, and return the buffer.

3. Otherwise, select the buffer pointed to by NextVictimBuffer, and
circularly advance NextVictimBuffer for next time.

4. If the selected buffer is pinned or has a nonzero usage count, it cannot
be used.  Decrement its usage count (if nonzero) and return to step 3 to
examine the next buffer.

5. Pin the selected buffer, release BufFreelistLock, and return the buffer.
(An illustrative sketch of this loop appears at the end of this section.)

(Note that if the selected buffer is dirty, we will have to write it out
before we can recycle it; if someone else pins the buffer meanwhile we will
have to give up and try another buffer.  This however is not a concern
of the basic select-a-victim-buffer algorithm.)

A special provision is that while running VACUUM, a backend does not
increment the usage count on buffers it accesses.  In fact, if ReleaseBuffer
sees that it is dropping the pin count to zero and the usage count is zero,
then it appends the buffer to the tail of the free list.  (This implies that
VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer;
this shouldn't create much of a contention problem.)  This provision
encourages VACUUM to work in a relatively small number of buffers rather
than blowing out the entire buffer cache.  It is reasonable since a page
that has been touched only by VACUUM is unlikely to be needed again soon.

Since VACUUM usually requests many pages very fast, the effect of this is that
it will get back the very buffers it filled and possibly modified on the next
call and will therefore do its work in a few shared memory buffers, while
being able to use whatever it finds in the cache already.  This also implies
that most of the write traffic caused by a VACUUM will be done by the VACUUM
itself and not pushed off onto other processes.
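The victim-selection loop just described can be summarized in C roughly as follows. This is an illustrative sketch only, not the code in freelist.c: NextVictimBuffer, BufFreelistLock, NBuffers, and the buffer-header fields (refcount, usage_count, freeNext, buf_hdr_lock) use the names appearing elsewhere in this commit, but FirstFreeBuffer, sketch_get_victim, and the direct call to bufmgr.c's PinBuffer_Locked are simplifications of the real StrategyGetBuffer machinery.

#include "postgres.h"
#include "storage/buf_internals.h"
#include "storage/lwlock.h"
#include "storage/spin.h"

extern BufferDesc *BufferDescriptors;   /* real global from buf_init.c */
extern int      NBuffers;               /* real global */
static int      FirstFreeBuffer = -1;   /* placeholder: the real free-list head lives in freelist.c */
static int      NextVictimBuffer = 0;   /* placeholder for the shared clock hand */
extern void PinBuffer_Locked(BufferDesc *buf);  /* static helper in bufmgr.c; shown for illustration */

static BufferDesc *
sketch_get_victim(void)
{
    BufferDesc *buf;

    LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);       /* step 1 */

    while (FirstFreeBuffer >= 0)                        /* step 2: try the free list */
    {
        buf = &BufferDescriptors[FirstFreeBuffer];
        FirstFreeBuffer = buf->freeNext;
        SpinLockAcquire(&buf->buf_hdr_lock);
        if (buf->refcount == 0 && buf->usage_count == 0)
        {
            PinBuffer_Locked(buf);                      /* pin, then drop the spinlock */
            LWLockRelease(BufFreelistLock);
            return buf;
        }
        SpinLockRelease(&buf->buf_hdr_lock);            /* unusable; keep looking */
    }

    for (;;)                                            /* steps 3-5: run the clock */
    {
        buf = &BufferDescriptors[NextVictimBuffer];
        NextVictimBuffer = (NextVictimBuffer + 1) % NBuffers;
        SpinLockAcquire(&buf->buf_hdr_lock);
        if (buf->refcount == 0 && buf->usage_count == 0)
        {
            PinBuffer_Locked(buf);
            LWLockRelease(BufFreelistLock);
            return buf;
        }
        if (buf->usage_count > 0)
            buf->usage_count--;                         /* give it another trip around */
        SpinLockRelease(&buf->buf_hdr_lock);
        /* the real code gives up with an error if it cycles through every
         * buffer without finding one that is unpinned */
    }
}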
Background writer's processing
------------------------------
The background writer is designed to write out pages that are likely to be
recycled soon, thereby offloading the writing work from active backends.
To do this, it scans forward circularly from the current position of
NextVictimBuffer (which it does not change!), looking for buffers that are
dirty and not pinned nor marked with a positive usage count. It pins,
writes, and releases any such buffer.
If we can assume that reading NextVictimBuffer is an atomic action, then
the writer doesn't even need to take the BufFreelistLock in order to look
for buffers to write; it needs only to spinlock each buffer header for long
enough to check the dirtybit. Even without that assumption, the writer
only needs to take the lock long enough to read the variable value, not
while scanning the buffers. (This is a very substantial improvement in
the contention cost of the writer compared to PG 8.0.)
During a checkpoint, the writer's strategy must be to write every dirty
buffer (pinned or not!). We may as well make it start this scan from
NextVictimBuffer, however, so that the first-to-be-written pages are the
ones that backends might otherwise have to write for themselves soon.
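For illustration, the LRU-scan half of the background writer's work might be coded along the lines below. This is a sketch based on the description above, not the actual BgBufferSync() in bufmgr.c, although SyncOneBuffer() and the GUC variables are names introduced by this commit; how the starting point is obtained and the loop bookkeeping are simplified here.

#include "postgres.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/lwlock.h"

extern int  NBuffers;
static int  NextVictimBuffer;           /* placeholder for the shared clock hand */
extern bool SyncOneBuffer(int buf_id, bool skip_pinned);  /* static in bufmgr.c; shown for illustration */

static void
sketch_bgwriter_lru_scan(void)
{
    int     buf_id;
    int     to_scan;
    int     written = 0;

    /* read the clock hand once; the writer does not advance it itself */
    LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
    buf_id = NextVictimBuffer;
    LWLockRelease(BufFreelistLock);

    /* examine up to bgwriter_lru_percent of the pool per round */
    to_scan = (int) ((NBuffers * bgwriter_lru_percent) / 100.0);

    while (to_scan-- > 0 && written < bgwriter_lru_maxpages)
    {
        /* skip_pinned = true: only write buffers that look recyclable */
        if (SyncOneBuffer(buf_id, true))
            written++;
        buf_id = (buf_id + 1) % NBuffers;   /* advance circularly */
    }
}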
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.72 2005/03/04 20:21:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -22,6 +22,8 @@ BufferDesc *BufferDescriptors;
Block *BufferBlockPointers;
int32 *PrivateRefCount;
static char *BufferBlocks;
/* statistics counters */
long int ReadBufferCount;
long int ReadLocalBufferCount;
...@@ -50,16 +52,11 @@ long int LocalBufferFlushCount; ...@@ -50,16 +52,11 @@ long int LocalBufferFlushCount;
* *
* Synchronization/Locking: * Synchronization/Locking:
* *
* BufMgrLock lock -- must be acquired before manipulating the
* buffer search datastructures (lookup/freelist, as well as the
* flag bits of any buffer). Must be released
* before exit and before doing any IO.
*
* IO_IN_PROGRESS -- this is a flag in the buffer descriptor. * IO_IN_PROGRESS -- this is a flag in the buffer descriptor.
* It must be set when an IO is initiated and cleared at * It must be set when an IO is initiated and cleared at
* the end of the IO. It is there to make sure that one * the end of the IO. It is there to make sure that one
* process doesn't start to use a buffer while another is * process doesn't start to use a buffer while another is
* faulting it in. see IOWait/IOSignal. * faulting it in. see WaitIO and related routines.
* *
* refcount -- Counts the number of processes holding pins on a buffer. * refcount -- Counts the number of processes holding pins on a buffer.
* A buffer is pinned during IO and immediately after a BufferAlloc(). * A buffer is pinned during IO and immediately after a BufferAlloc().
...@@ -85,10 +82,8 @@ long int LocalBufferFlushCount; ...@@ -85,10 +82,8 @@ long int LocalBufferFlushCount;
void void
InitBufferPool(void) InitBufferPool(void)
{ {
char *BufferBlocks;
bool foundBufs, bool foundBufs,
foundDescs; foundDescs;
int i;
BufferDescriptors = (BufferDesc *) BufferDescriptors = (BufferDesc *)
ShmemInitStruct("Buffer Descriptors", ShmemInitStruct("Buffer Descriptors",
...@@ -102,52 +97,42 @@ InitBufferPool(void) ...@@ -102,52 +97,42 @@ InitBufferPool(void)
{ {
/* both should be present or neither */ /* both should be present or neither */
Assert(foundDescs && foundBufs); Assert(foundDescs && foundBufs);
/* note: this path is only taken in EXEC_BACKEND case */
} }
else else
{ {
BufferDesc *buf; BufferDesc *buf;
char *block; int i;
/*
* It's probably not really necessary to grab the lock --- if
* there's anyone else attached to the shmem at this point, we've
* got problems.
*/
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
buf = BufferDescriptors; buf = BufferDescriptors;
block = BufferBlocks;
/* /*
* Initialize all the buffer headers. * Initialize all the buffer headers.
*/ */
for (i = 0; i < NBuffers; block += BLCKSZ, buf++, i++) for (i = 0; i < NBuffers; buf++, i++)
{ {
Assert(ShmemIsValid((unsigned long) block)); CLEAR_BUFFERTAG(buf->tag);
buf->flags = 0;
buf->usage_count = 0;
buf->refcount = 0;
buf->wait_backend_id = 0;
/* SpinLockInit(&buf->buf_hdr_lock);
* The bufNext fields link together all totally-unused buffers.
* Subsequent management of this list is done by
* StrategyGetBuffer().
*/
buf->bufNext = i + 1;
CLEAR_BUFFERTAG(buf->tag);
buf->buf_id = i; buf->buf_id = i;
buf->data = MAKE_OFFSET(block); /*
buf->flags = 0; * Initially link all the buffers together as unused.
buf->refcount = 0; * Subsequent management of this list is done by freelist.c.
*/
buf->freeNext = i + 1;
buf->io_in_progress_lock = LWLockAssign(); buf->io_in_progress_lock = LWLockAssign();
buf->cntx_lock = LWLockAssign(); buf->content_lock = LWLockAssign();
buf->cntxDirty = false;
buf->wait_backend_id = 0;
} }
/* Correct last entry of linked list */ /* Correct last entry of linked list */
BufferDescriptors[NBuffers - 1].bufNext = -1; BufferDescriptors[NBuffers - 1].freeNext = FREENEXT_END_OF_LIST;
LWLockRelease(BufMgrLock);
} }
/* Init other shared buffer-management stuff */ /* Init other shared buffer-management stuff */
...@@ -162,12 +147,13 @@ InitBufferPool(void) ...@@ -162,12 +147,13 @@ InitBufferPool(void)
* buffer pool. * buffer pool.
* *
* NB: this is called before InitProcess(), so we do not have a PGPROC and * NB: this is called before InitProcess(), so we do not have a PGPROC and
* cannot do LWLockAcquire; hence we can't actually access the bufmgr's * cannot do LWLockAcquire; hence we can't actually access stuff in
* shared memory yet. We are only initializing local data here. * shared memory yet. We are only initializing local data here.
*/ */
void void
InitBufferPoolAccess(void) InitBufferPoolAccess(void)
{ {
char *block;
int i; int i;
/* /*
...@@ -179,12 +165,18 @@ InitBufferPoolAccess(void) ...@@ -179,12 +165,18 @@ InitBufferPoolAccess(void)
sizeof(*PrivateRefCount)); sizeof(*PrivateRefCount));
/* /*
* Convert shmem offsets into addresses as seen by this process. This * Construct addresses for the individual buffer data blocks. We do
* is just to speed up the BufferGetBlock() macro. It is OK to do this * this just to speed up the BufferGetBlock() macro. (Since the
* without any lock since the data pointers never change. * addresses should be the same in every backend, we could inherit
* this data from the postmaster --- but in the EXEC_BACKEND case
* that doesn't work.)
*/ */
block = BufferBlocks;
for (i = 0; i < NBuffers; i++) for (i = 0; i < NBuffers; i++)
BufferBlockPointers[i] = (Block) MAKE_PTR(BufferDescriptors[i].data); {
BufferBlockPointers[i] = (Block) block;
block += BLCKSZ;
}
} }
/* /*
......
@@ -3,12 +3,9 @@
* buf_table.c
* routines for mapping BufferTags to buffer indexes.
*
* Note: the routines in this file do no locking of their own.  The caller
* must hold a suitable lock on the BufMappingLock, as specified in the
* comments.
*
*
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
@@ -16,7 +13,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.40 2005/03/04 20:21:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -74,17 +71,17 @@ InitBufTable(int size)
/*
* BufTableLookup
* Lookup the given BufferTag; return buffer ID, or -1 if not found
*
* Caller must hold at least share lock on BufMappingLock
*/
int
BufTableLookup(BufferTag *tagPtr)
{
BufferLookupEnt *result;
if (tagPtr->blockNum == P_NEW)
return -1;
result = (BufferLookupEnt *)
hash_search(SharedBufHash, (void *) tagPtr, HASH_FIND, NULL);
if (!result)
return -1;
@@ -93,14 +90,23 @@ BufTableLookup(BufferTag *tagPtr)
/*
* BufTableInsert
* Insert a hashtable entry for given tag and buffer ID,
* unless an entry already exists for that tag
*
* Returns -1 on successful insertion.  If a conflicting entry exists
* already, returns the buffer ID in that entry.
*
* Caller must hold write lock on BufMappingLock
*/
int
BufTableInsert(BufferTag *tagPtr, int buf_id)
{
BufferLookupEnt *result;
bool found;
Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
result = (BufferLookupEnt *)
hash_search(SharedBufHash, (void *) tagPtr, HASH_ENTER, &found);
@@ -109,15 +115,19 @@ BufTableInsert(BufferTag *tagPtr, int buf_id)
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of shared memory")));
if (found) /* found something already in the table */
return result->id;
result->id = buf_id;
return -1;
}
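The collision-returning behavior of BufTableInsert() could be used by a caller roughly as sketched below. This is illustrative only, not the BufferAlloc() code shown later in this commit; in particular, the real caller keeps holding the BufMappingLock while it finishes adjusting the chosen buffer's header, whereas this sketch releases it immediately.

#include "postgres.h"
#include "storage/buf_internals.h"
#include "storage/lwlock.h"

/* Sketch: try to claim 'buf_id' for 'newTag', coping with a concurrent insert. */
static int
sketch_claim_mapping(BufferTag *newTag, int buf_id)
{
    int     existing;

    LWLockAcquire(BufMappingLock, LW_EXCLUSIVE);    /* write lock is required */
    existing = BufTableInsert(newTag, buf_id);
    LWLockRelease(BufMappingLock);

    if (existing >= 0)
    {
        /* someone else already mapped this page; use their buffer instead */
        return existing;
    }
    return buf_id;      /* insertion succeeded (BufTableInsert returned -1) */
}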
/*
* BufTableDelete
* Delete the hashtable entry for given tag (which must exist)
*
* Caller must hold write lock on BufMappingLock
*/
void
BufTableDelete(BufferTag *tagPtr)
......
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.186 2005/03/04 20:21:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -25,7 +25,9 @@
*
* WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
*
* BufferSync() -- flush all dirty buffers in the buffer pool.
*
* BgBufferSync() -- flush some dirty buffers in the buffer pool.
*
* InitBufferPool() -- Init the buffer module.
*
@@ -50,16 +52,22 @@
#include "pgstat.h"
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr) BufferBlockPointers[(bufHdr)->buf_id]
#define BufferGetLSN(bufHdr) (*((XLogRecPtr*) BufHdrGetBlock(bufHdr)))
/* Note: this macro only works on local buffers, not shared ones! */
#define LocalBufHdrGetBlock(bufHdr) \
LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
/* GUC variables */
bool zero_damaged_pages = false;
double bgwriter_lru_percent = 1.0;
double bgwriter_all_percent = 0.333;
int bgwriter_lru_maxpages = 5;
int bgwriter_all_maxpages = 5;
#ifdef NOT_USED
bool ShowPinTrace = false;
#endif
long NDirectFileRead; /* some I/O's are direct file access. long NDirectFileRead; /* some I/O's are direct file access.
* bypass bufmgr */ * bypass bufmgr */
...@@ -73,18 +81,18 @@ static bool IsForInput; ...@@ -73,18 +81,18 @@ static bool IsForInput;
static BufferDesc *PinCountWaitBuf = NULL; static BufferDesc *PinCountWaitBuf = NULL;
static void PinBuffer(BufferDesc *buf, bool fixOwner); static bool PinBuffer(BufferDesc *buf);
static void UnpinBuffer(BufferDesc *buf, bool fixOwner); static void PinBuffer_Locked(BufferDesc *buf);
static void UnpinBuffer(BufferDesc *buf, bool fixOwner, bool trashOK);
static bool SyncOneBuffer(int buf_id, bool skip_pinned);
static void WaitIO(BufferDesc *buf); static void WaitIO(BufferDesc *buf);
static void StartBufferIO(BufferDesc *buf, bool forInput); static bool StartBufferIO(BufferDesc *buf, bool forInput);
static void TerminateBufferIO(BufferDesc *buf, int err_flag); static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
static void ContinueBufferIO(BufferDesc *buf, bool forInput); int set_flag_bits);
static void buffer_write_error_callback(void *arg); static void buffer_write_error_callback(void *arg);
static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
bool bufferLockHeld);
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr); bool *foundPtr);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, bool earlylock); static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
static void write_buffer(Buffer buffer, bool unpin); static void write_buffer(Buffer buffer, bool unpin);
...@@ -105,28 +113,16 @@ static void write_buffer(Buffer buffer, bool unpin); ...@@ -105,28 +113,16 @@ static void write_buffer(Buffer buffer, bool unpin);
*/ */
Buffer Buffer
ReadBuffer(Relation reln, BlockNumber blockNum) ReadBuffer(Relation reln, BlockNumber blockNum)
{
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
return ReadBufferInternal(reln, blockNum, false);
}
/*
* ReadBufferInternal -- internal version of ReadBuffer with more options
*
* bufferLockHeld: if true, caller already acquired the bufmgr lock.
* (This is assumed never to be true if dealing with a local buffer!)
*
* The caller must have done ResourceOwnerEnlargeBuffers(CurrentResourceOwner)
*/
static Buffer
ReadBufferInternal(Relation reln, BlockNumber blockNum,
bool bufferLockHeld)
{ {
BufferDesc *bufHdr; BufferDesc *bufHdr;
Block bufBlock;
bool found; bool found;
bool isExtend; bool isExtend;
bool isLocalBuf; bool isLocalBuf;
/* Make sure we will have room to remember the buffer pin */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
isExtend = (blockNum == P_NEW); isExtend = (blockNum == P_NEW);
isLocalBuf = reln->rd_istemp; isLocalBuf = reln->rd_istemp;
...@@ -137,10 +133,11 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -137,10 +133,11 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
if (isExtend) if (isExtend)
blockNum = smgrnblocks(reln->rd_smgr); blockNum = smgrnblocks(reln->rd_smgr);
pgstat_count_buffer_read(&reln->pgstat_info, reln);
if (isLocalBuf) if (isLocalBuf)
{ {
ReadLocalBufferCount++; ReadLocalBufferCount++;
pgstat_count_buffer_read(&reln->pgstat_info, reln);
bufHdr = LocalBufferAlloc(reln, blockNum, &found); bufHdr = LocalBufferAlloc(reln, blockNum, &found);
if (found) if (found)
LocalBufferHitCount++; LocalBufferHitCount++;
...@@ -148,20 +145,17 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -148,20 +145,17 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
else else
{ {
ReadBufferCount++; ReadBufferCount++;
pgstat_count_buffer_read(&reln->pgstat_info, reln);
/* /*
* lookup the buffer. IO_IN_PROGRESS is set if the requested * lookup the buffer. IO_IN_PROGRESS is set if the requested
* block is not currently in memory. * block is not currently in memory.
*/ */
if (!bufferLockHeld)
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
bufHdr = BufferAlloc(reln, blockNum, &found); bufHdr = BufferAlloc(reln, blockNum, &found);
if (found) if (found)
BufferHitCount++; BufferHitCount++;
} }
/* At this point we do NOT hold the bufmgr lock. */ /* At this point we do NOT hold any locks. */
/* if it was already in the buffer pool, we're done */ /* if it was already in the buffer pool, we're done */
if (found) if (found)
...@@ -187,20 +181,22 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -187,20 +181,22 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
* same buffer (if it's not been recycled) but come right back here to * same buffer (if it's not been recycled) but come right back here to
* try smgrextend again. * try smgrextend again.
*/ */
Assert(!(bufHdr->flags & BM_VALID)); Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed */
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (isExtend) if (isExtend)
{ {
/* new buffers are zero-filled */ /* new buffers are zero-filled */
MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); MemSet((char *) bufBlock, 0, BLCKSZ);
smgrextend(reln->rd_smgr, blockNum, (char *) MAKE_PTR(bufHdr->data), smgrextend(reln->rd_smgr, blockNum, (char *) bufBlock,
reln->rd_istemp); reln->rd_istemp);
} }
else else
{ {
smgrread(reln->rd_smgr, blockNum, (char *) MAKE_PTR(bufHdr->data)); smgrread(reln->rd_smgr, blockNum, (char *) bufBlock);
/* check for garbage data */ /* check for garbage data */
if (!PageHeaderIsValid((PageHeader) MAKE_PTR(bufHdr->data))) if (!PageHeaderIsValid((PageHeader) bufBlock))
{ {
/* /*
* During WAL recovery, the first access to any data page * During WAL recovery, the first access to any data page
...@@ -215,7 +211,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -215,7 +211,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
(errcode(ERRCODE_DATA_CORRUPTED), (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid page header in block %u of relation \"%s\"; zeroing out page", errmsg("invalid page header in block %u of relation \"%s\"; zeroing out page",
blockNum, RelationGetRelationName(reln)))); blockNum, RelationGetRelationName(reln))));
MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); MemSet((char *) bufBlock, 0, BLCKSZ);
} }
else else
ereport(ERROR, ereport(ERROR,
...@@ -232,16 +228,8 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -232,16 +228,8 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
} }
else else
{ {
/* lock buffer manager again to update IO IN PROGRESS */ /* Set BM_VALID, terminate IO, and wake up any waiters */
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); TerminateBufferIO(bufHdr, false, BM_VALID);
/* IO Succeeded, so mark data valid */
bufHdr->flags |= BM_VALID;
/* If anyone was waiting for IO to complete, wake them up now */
TerminateBufferIO(bufHdr, 0);
LWLockRelease(BufMgrLock);
} }
if (VacuumCostActive) if (VacuumCostActive)
...@@ -263,8 +251,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum, ...@@ -263,8 +251,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
* *foundPtr is actually redundant with the buffer's BM_VALID flag, but * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
* we keep it for simplicity in ReadBuffer. * we keep it for simplicity in ReadBuffer.
* *
* BufMgrLock must be held at entry. When this routine returns, * No locks are held either at entry or exit.
* the BufMgrLock is guaranteed NOT to be held.
*/ */
static BufferDesc * static BufferDesc *
BufferAlloc(Relation reln, BufferAlloc(Relation reln,
...@@ -272,229 +259,343 @@ BufferAlloc(Relation reln, ...@@ -272,229 +259,343 @@ BufferAlloc(Relation reln,
bool *foundPtr) bool *foundPtr)
{ {
BufferTag newTag; /* identity of requested block */ BufferTag newTag; /* identity of requested block */
BufferDesc *buf, BufferTag oldTag;
*buf2; BufFlags oldFlags;
int cdb_found_index, int buf_id;
cdb_replace_index; BufferDesc *buf;
bool inProgress; /* did we already do StartBufferIO? */ bool valid;
/* create a tag so we can lookup the buffer */ /* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, reln, blockNum); INIT_BUFFERTAG(newTag, reln, blockNum);
/* see if the block is in the buffer pool already */ /* see if the block is in the buffer pool already */
buf = StrategyBufferLookup(&newTag, false, &cdb_found_index); LWLockAcquire(BufMappingLock, LW_SHARED);
if (buf != NULL) buf_id = BufTableLookup(&newTag);
if (buf_id >= 0)
{ {
/* /*
* Found it. Now, pin the buffer so no one can steal it from the * Found it. Now, pin the buffer so no one can steal it from the
* buffer pool, and check to see if someone else is still reading * buffer pool, and check to see if the correct data has been
* data into the buffer. (Formerly, we'd always block here if * loaded into the buffer.
* IO_IN_PROGRESS is set, but there's no need to wait when someone
* is writing rather than reading.)
*/ */
*foundPtr = TRUE; buf = &BufferDescriptors[buf_id];
PinBuffer(buf, true); valid = PinBuffer(buf);
if (!(buf->flags & BM_VALID)) /* Can release the mapping lock as soon as we've pinned it */
{ LWLockRelease(BufMappingLock);
if (buf->flags & BM_IO_IN_PROGRESS)
*foundPtr = TRUE;
if (!valid)
{ {
/* someone else is reading it, wait for them */ /*
WaitIO(buf); * We can only get here if (a) someone else is still reading
} * in the page, or (b) a previous read attempt failed. We
if (!(buf->flags & BM_VALID)) * have to wait for any active read attempt to finish, and
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
if (StartBufferIO(buf, true))
{ {
/* /*
* If we get here, previous attempts to read the buffer * If we get here, previous attempts to read the buffer
* must have failed ... but we shall bravely try again. * must have failed ... but we shall bravely try again.
*/ */
*foundPtr = FALSE; *foundPtr = FALSE;
StartBufferIO(buf, true);
} }
} }
LWLockRelease(BufMgrLock);
return buf; return buf;
} }
*foundPtr = FALSE;
/* /*
* Didn't find it in the buffer pool. We'll have to initialize a new * Didn't find it in the buffer pool. We'll have to initialize a new
* buffer. First, grab one from the free list. If it's dirty, flush * buffer. Remember to unlock BufMappingLock while doing the work.
* it to disk. Remember to unlock BufMgrLock while doing the IO.
*/ */
inProgress = FALSE; LWLockRelease(BufMappingLock);
do
{
buf = StrategyGetBuffer(&cdb_replace_index);
/* StrategyGetBuffer will elog if it can't find a free buffer */
Assert(buf);
/* Loop here in case we have to try another victim buffer */
for (;;)
{
/* /*
* There should be exactly one pin on the buffer after it is * Select a victim buffer. The buffer is returned with its
* allocated -- ours. If it had a pin it wouldn't have been on * header spinlock still held! Also the BufFreelistLock is
* the free list. No one else could have pinned it between * still held, since it would be bad to hold the spinlock
* StrategyGetBuffer and here because we have the BufMgrLock. * while possibly waking up other processes.
*
* (We must pin the buffer before releasing BufMgrLock ourselves,
* to ensure StrategyGetBuffer won't give the same buffer to someone
* else.)
*/ */
buf = StrategyGetBuffer();
Assert(buf->refcount == 0); Assert(buf->refcount == 0);
buf->refcount = 1;
PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1;
ResourceOwnerRememberBuffer(CurrentResourceOwner, /* Must copy buffer flags while we still hold the spinlock */
BufferDescriptorGetBuffer(buf)); oldFlags = buf->flags;
/* Pin the buffer and then release the buffer spinlock */
PinBuffer_Locked(buf);
if ((buf->flags & BM_VALID) && /* Now it's safe to release the freelist lock */
(buf->flags & BM_DIRTY || buf->cntxDirty)) LWLockRelease(BufFreelistLock);
/*
* If the buffer was dirty, try to write it out. There is a race
* condition here, in that someone might dirty it after we released
* it above, or even while we are writing it out (since our share-lock
* won't prevent hint-bit updates). We will recheck the dirty bit
* after re-locking the buffer header.
*/
if (oldFlags & BM_DIRTY)
{ {
/* /*
* Set BM_IO_IN_PROGRESS to show the buffer is being written. * We need a share-lock on the buffer contents to write it out
* It cannot already be set because the buffer would be pinned * (else we might write invalid data, eg because someone else
* if someone were writing it. * is compacting the page contents while we write). We must use
* * a conditional lock acquisition here to avoid deadlock. Even
* Note: it's okay to grab the io_in_progress lock while holding * though the buffer was not pinned (and therefore surely not
* BufMgrLock. All code paths that acquire this lock pin the * locked) when StrategyGetBuffer returned it, someone else could
* buffer first; since no one had it pinned (it just came off * have pinned and exclusive-locked it by the time we get here.
* the free list), no one else can have the lock. * If we try to get the lock unconditionally, we'd block waiting
* for them; if they later block waiting for us, deadlock ensues.
* (This has been observed to happen when two backends are both
* trying to split btree index pages, and the second one just
* happens to be trying to split the page the first one got from
* StrategyGetBuffer.)
*/
if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
{
FlushBuffer(buf, NULL);
LWLockRelease(buf->content_lock);
}
else
{
/*
* Someone else has pinned the buffer, so give it up and
* loop back to get another one.
*/ */
StartBufferIO(buf, false); UnpinBuffer(buf, true, false /* evidently recently used */ );
continue;
inProgress = TRUE; }
}
/* /*
* Write the buffer out, being careful to release BufMgrLock * Acquire exclusive mapping lock in preparation for changing
* while doing the I/O. We also tell FlushBuffer to share-lock * the buffer's association.
* the buffer before releasing BufMgrLock. This is safe because */
* we know no other backend currently has the buffer pinned, LWLockAcquire(BufMappingLock, LW_EXCLUSIVE);
* therefore no one can have it locked either, so we can always
* get the lock without blocking. It is necessary because if
* we release BufMgrLock first, it's possible for someone else
* to pin and exclusive-lock the buffer before we get to the
* share-lock, causing us to block. If the someone else then
* blocks on a lock we hold, deadlock ensues. This has been
* observed to happen when two backends are both trying to split
* btree index pages, and the second one just happens to be
* trying to split the page the first one got from the freelist.
*/
FlushBuffer(buf, NULL, true);
/* /*
* Somebody could have allocated another buffer for the same * Try to make a hashtable entry for the buffer under its new tag.
* block we are about to read in. While we flush out the dirty * This could fail because while we were writing someone else
* buffer, we don't hold the lock and someone could have * allocated another buffer for the same block we want to read in.
* allocated another buffer for the same block. The problem is * Note that we have not yet removed the hashtable entry for the
* we haven't yet inserted the new tag into the buffer table. * old tag.
* So we need to check here. -ay 3/95
*
* Another reason we have to do this is to update
* cdb_found_index, since the CDB could have disappeared from
* B1/B2 list while we were writing.
*/ */
buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index); buf_id = BufTableInsert(&newTag, buf->buf_id);
if (buf2 != NULL)
if (buf_id >= 0)
{ {
/* /*
* Found it. Someone has already done what we were about * Got a collision. Someone has already done what we were about
* to do. We'll just handle this as if it were found in * to do. We'll just handle this as if it were found in
* the buffer pool in the first place. First, give up the * the buffer pool in the first place. First, give up the
* buffer we were planning to use. * buffer we were planning to use. Don't allow it to be
* thrown in the free list (we don't want to hold both
* global locks at once).
*/ */
TerminateBufferIO(buf, 0); UnpinBuffer(buf, true, false);
UnpinBuffer(buf, true);
buf = buf2;
/* remaining code should match code at top of routine */ /* remaining code should match code at top of routine */
*foundPtr = TRUE; buf = &BufferDescriptors[buf_id];
PinBuffer(buf, true); valid = PinBuffer(buf);
if (!(buf->flags & BM_VALID)) /* Can release the mapping lock as soon as we've pinned it */
{ LWLockRelease(BufMappingLock);
if (buf->flags & BM_IO_IN_PROGRESS)
*foundPtr = TRUE;
if (!valid)
{ {
/* someone else is reading it, wait for them */ /*
WaitIO(buf); * We can only get here if (a) someone else is still reading
} * in the page, or (b) a previous read attempt failed. We
if (!(buf->flags & BM_VALID)) * have to wait for any active read attempt to finish, and
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
if (StartBufferIO(buf, true))
{ {
/* /*
* If we get here, previous attempts to read the * If we get here, previous attempts to read the buffer
* buffer must have failed ... but we shall * must have failed ... but we shall bravely try again.
* bravely try again.
*/ */
*foundPtr = FALSE; *foundPtr = FALSE;
StartBufferIO(buf, true);
} }
} }
LWLockRelease(BufMgrLock);
return buf; return buf;
} }
/* /*
* Somebody could have pinned the buffer while we were doing * Need to lock the buffer header too in order to change its tag.
* the I/O and had given up the BufMgrLock. If so, we can't
* recycle this buffer --- we need to clear the I/O flags,
* remove our pin and choose a new victim buffer. Similarly,
* we have to start over if somebody re-dirtied the buffer.
*/ */
if (buf->refcount > 1 || buf->flags & BM_DIRTY || buf->cntxDirty) LockBufHdr_NoHoldoff(buf);
{
TerminateBufferIO(buf, 0);
UnpinBuffer(buf, true);
inProgress = FALSE;
buf = NULL;
}
}
} while (buf == NULL);
/* /*
* At this point we should have the sole pin on a non-dirty buffer and * Somebody could have pinned or re-dirtied the buffer while we were
* we may or may not already have the BM_IO_IN_PROGRESS flag set. * doing the I/O and making the new hashtable entry. If so, we
* can't recycle this buffer; we must undo everything we've done and
* start over with a new victim buffer.
*/ */
if (buf->refcount == 1 && !(buf->flags & BM_DIRTY))
break;
UnlockBufHdr_NoHoldoff(buf);
BufTableDelete(&newTag);
LWLockRelease(BufMappingLock);
UnpinBuffer(buf, true, false /* evidently recently used */ );
}
/* /*
* Tell the buffer replacement strategy that we are replacing the * Okay, it's finally safe to rename the buffer.
* buffer content. Then rename the buffer. Clearing BM_VALID here is *
* necessary, clearing the dirtybits is just paranoia. * Clearing BM_VALID here is necessary, clearing the dirtybits
* is just paranoia. We also clear the usage_count since any
* recency of use of the old content is no longer relevant.
*/ */
StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index); oldTag = buf->tag;
oldFlags = buf->flags;
buf->tag = newTag; buf->tag = newTag;
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR); buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
buf->cntxDirty = false; buf->flags |= BM_TAG_VALID;
buf->usage_count = 0;
UnlockBufHdr_NoHoldoff(buf);
if (oldFlags & BM_TAG_VALID)
BufTableDelete(&oldTag);
LWLockRelease(BufMappingLock);
/* /*
* Buffer contents are currently invalid. Have to mark IO IN PROGRESS * Buffer contents are currently invalid. Try to get the io_in_progress
* so no one fiddles with them until the read completes. We may have * lock. If StartBufferIO returns false, then someone else managed
* already marked it, in which case we just flip from write to read * to read it before we did, so there's nothing left for BufferAlloc()
* status. * to do.
*/ */
if (!inProgress) if (StartBufferIO(buf, true))
StartBufferIO(buf, true); *foundPtr = FALSE;
else else
ContinueBufferIO(buf, true); *foundPtr = TRUE;
LWLockRelease(BufMgrLock);
return buf; return buf;
} }
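/*
 * Illustrative sketch only -- not part of this patch.  It models the
 * "pin before releasing the mapping lock" pattern that BufferAlloc uses
 * above, with pthreads standing in for LWLocks and spinlocks.  All names
 * here (buf_desc, mapping_lock, hash_lookup, lookup_and_pin) are
 * hypothetical stand-ins, not PostgreSQL APIs.
 */
#include <pthread.h>
#include <stddef.h>

typedef struct buf_desc
{
	pthread_mutex_t hdr_lock;	/* stand-in for the per-buffer spinlock */
	int			refcount;		/* shared pin count */
	int			tag;			/* stand-in for BufferTag */
	int			valid;			/* stand-in for BM_VALID */
} buf_desc;

static pthread_rwlock_t mapping_lock = PTHREAD_RWLOCK_INITIALIZER;

/* hypothetical shared hash lookup; returns NULL if the tag is not mapped */
extern buf_desc *hash_lookup(int tag);

static buf_desc *
lookup_and_pin(int tag, int *found_valid)
{
	buf_desc   *buf;

	*found_valid = 0;
	pthread_rwlock_rdlock(&mapping_lock);
	buf = hash_lookup(tag);
	if (buf != NULL)
	{
		/*
		 * Pin while the mapping lock is still held, so the entry cannot be
		 * recycled between lookup and pin; after that the mapping lock can
		 * be dropped and only the cheap per-buffer lock is ever needed.
		 */
		pthread_mutex_lock(&buf->hdr_lock);
		buf->refcount++;
		*found_valid = buf->valid;
		pthread_mutex_unlock(&buf->hdr_lock);
	}
	pthread_rwlock_unlock(&mapping_lock);
	return buf;
}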
/*
* InvalidateBuffer -- mark a shared buffer invalid and return it to the
* freelist.
*
* The buffer header spinlock must be held at entry. We drop it before
* returning. (This is sane because the caller must have locked the
* buffer in order to be sure it should be dropped.)
*
* This is used only in contexts such as dropping a relation. We assume
* that no other backend could possibly be interested in using the page,
* so the only reason the buffer might be pinned is if someone else is
* trying to write it out. We have to let them finish before we can
* reclaim the buffer.
*
* The buffer could get reclaimed by someone else while we are waiting
* to acquire the necessary locks; if so, don't mess it up.
*/
static void
InvalidateBuffer(BufferDesc *buf)
{
BufferTag oldTag;
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
oldTag = buf->tag;
UnlockBufHdr(buf);
retry:
/*
* Acquire exclusive mapping lock in preparation for changing
* the buffer's association.
*/
LWLockAcquire(BufMappingLock, LW_EXCLUSIVE);
/* Re-lock the buffer header (NoHoldoff since we have an LWLock) */
LockBufHdr_NoHoldoff(buf);
/* If it's changed while we were waiting for lock, do nothing */
if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
{
UnlockBufHdr_NoHoldoff(buf);
LWLockRelease(BufMappingLock);
return;
}
/*
* We assume the only reason for it to be pinned is that someone else
* is flushing the page out. Wait for them to finish. (This could be
* an infinite loop if the refcount is messed up... it would be nice
* to time out after awhile, but there seems no way to be sure how
* many loops may be needed. Note that if the other guy has pinned
* the buffer but not yet done StartBufferIO, WaitIO will fall through
* and we'll effectively be busy-looping here.)
*/
if (buf->refcount != 0)
{
UnlockBufHdr_NoHoldoff(buf);
LWLockRelease(BufMappingLock);
WaitIO(buf);
goto retry;
}
/*
* Clear out the buffer's tag and flags. We must do this to ensure
* that linear scans of the buffer array don't think the buffer is valid.
*/
oldFlags = buf->flags;
CLEAR_BUFFERTAG(buf->tag);
buf->flags = 0;
buf->usage_count = 0;
UnlockBufHdr_NoHoldoff(buf);
/*
* Remove the buffer from the lookup hashtable, if it was in there.
*/
if (oldFlags & BM_TAG_VALID)
BufTableDelete(&oldTag);
/*
* Avoid accepting a cancel interrupt when we release the mapping lock;
* that would leave the buffer free but not on the freelist. (Which would
* not be fatal, since it'd get picked up again by the clock scanning
* code, but we'd rather be sure it gets to the freelist.)
*/
HOLD_INTERRUPTS();
LWLockRelease(BufMappingLock);
/*
* Insert the buffer at the head of the list of free buffers.
*/
StrategyFreeBuffer(buf, true);
RESUME_INTERRUPTS();
}
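/*
 * Illustrative sketch only -- not part of this patch.  It shows the
 * re-check-after-relocking discipline that InvalidateBuffer follows above:
 * because every lock is dropped while waiting for the pin count to reach
 * zero, the buffer may have been recycled for some other page meanwhile,
 * so the tag must be compared again each time the locks are re-taken.
 * Modelled with pthreads; inv_buf, map_lock and the yield-based wait are
 * hypothetical stand-ins (the real code blocks on the io_in_progress lock
 * instead of yielding).
 */
#include <pthread.h>
#include <sched.h>

typedef struct inv_buf
{
	pthread_mutex_t hdr_lock;
	int			tag;			/* which page the slot currently holds */
	int			refcount;		/* pin count */
	int			valid;
} inv_buf;

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

static void
invalidate(inv_buf *buf, int expected_tag)
{
	for (;;)
	{
		pthread_rwlock_wrlock(&map_lock);
		pthread_mutex_lock(&buf->hdr_lock);

		if (buf->tag != expected_tag)
		{
			/* already recycled for another page; leave it alone */
			pthread_mutex_unlock(&buf->hdr_lock);
			pthread_rwlock_unlock(&map_lock);
			return;
		}
		if (buf->refcount == 0)
			break;				/* safe to clear it; both locks still held */

		/* still pinned (presumably being written out): drop locks, wait */
		pthread_mutex_unlock(&buf->hdr_lock);
		pthread_rwlock_unlock(&map_lock);
		sched_yield();
	}

	buf->valid = 0;				/* the slot is now free for reuse */
	pthread_mutex_unlock(&buf->hdr_lock);
	pthread_rwlock_unlock(&map_lock);
}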
/* /*
* write_buffer -- common functionality for * write_buffer -- common functionality for
* WriteBuffer and WriteNoReleaseBuffer * WriteBuffer and WriteNoReleaseBuffer
*/ */
static void static void
write_buffer(Buffer buffer, bool release) write_buffer(Buffer buffer, bool unpin)
{ {
BufferDesc *bufHdr; BufferDesc *bufHdr;
@@ -503,7 +604,7 @@ write_buffer(Buffer buffer, bool release)
if (BufferIsLocal(buffer)) if (BufferIsLocal(buffer))
{ {
WriteLocalBuffer(buffer, release); WriteLocalBuffer(buffer, unpin);
return; return;
} }
@@ -511,7 +612,8 @@ write_buffer(Buffer buffer, bool release)
Assert(PrivateRefCount[buffer - 1] > 0); Assert(PrivateRefCount[buffer - 1] > 0);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0); Assert(bufHdr->refcount > 0);
/* /*
@@ -522,9 +624,10 @@ write_buffer(Buffer buffer, bool release)
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
if (release) UnlockBufHdr(bufHdr);
UnpinBuffer(bufHdr, true);
LWLockRelease(BufMgrLock); if (unpin)
UnpinBuffer(bufHdr, true, true);
} }
/* /*
@@ -555,21 +658,16 @@ WriteNoReleaseBuffer(Buffer buffer)
/* /*
* ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
* to save a lock release/acquire.
* *
* Also, if the passed buffer is valid and already contains the desired block * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
* number, we simply return it without ever acquiring the lock at all. * compared to calling the two routines separately. Now it's mainly just
* Since the passed buffer must be pinned, it's OK to examine its block * a convenience function. However, if the passed buffer is valid and
* number without getting the lock first. * already contains the desired block, we just return it as-is; and that
* does save considerable work compared to a full release and reacquire.
* *
* Note: it is OK to pass buffer == InvalidBuffer, indicating that no old * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
* buffer actually needs to be released. This case is the same as ReadBuffer, * buffer actually needs to be released. This case is the same as ReadBuffer,
* but can save some tests in the caller. * but can save some tests in the caller.
*
* Also note: while it will work to call this routine with blockNum == P_NEW,
* it's best to avoid doing so, since that would result in calling
* smgrnblocks() while holding the bufmgr lock, hence some loss of
* concurrency.
*/ */
Buffer Buffer
ReleaseAndReadBuffer(Buffer buffer, ReleaseAndReadBuffer(Buffer buffer,
@@ -588,235 +686,313 @@ ReleaseAndReadBuffer(Buffer buffer,
RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node)) RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node))
return buffer; return buffer;
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer); ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
/* owner now has a free slot, so no need for Enlarge() */
LocalRefCount[-buffer - 1]--; LocalRefCount[-buffer - 1]--;
if (LocalRefCount[-buffer - 1] == 0 &&
bufHdr->usage_count < BM_MAX_USAGE_COUNT)
bufHdr->usage_count++;
} }
else else
{ {
Assert(PrivateRefCount[buffer - 1] > 0); Assert(PrivateRefCount[buffer - 1] > 0);
bufHdr = &BufferDescriptors[buffer - 1]; bufHdr = &BufferDescriptors[buffer - 1];
/* we have pin, so it's ok to examine tag without spinlock */
if (bufHdr->tag.blockNum == blockNum && if (bufHdr->tag.blockNum == blockNum &&
RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node)) RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node))
return buffer; return buffer;
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer); UnpinBuffer(bufHdr, true, true);
/* owner now has a free slot, so no need for Enlarge() */
if (PrivateRefCount[buffer - 1] > 1)
PrivateRefCount[buffer - 1]--;
else
{
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
UnpinBuffer(bufHdr, false);
return ReadBufferInternal(relation, blockNum, true);
} }
} }
}
else
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
return ReadBufferInternal(relation, blockNum, false); return ReadBuffer(relation, blockNum);
} }
/* /*
* PinBuffer -- make buffer unavailable for replacement. * PinBuffer -- make buffer unavailable for replacement.
* *
* This should be applied only to shared buffers, never local ones. * This should be applied only to shared buffers, never local ones.
* Bufmgr lock must be held by caller.
* *
* Most but not all callers want CurrentResourceOwner to be adjusted.
* Note that ResourceOwnerEnlargeBuffers must have been done already. * Note that ResourceOwnerEnlargeBuffers must have been done already.
*
* Returns TRUE if buffer is BM_VALID, else FALSE. This provision allows
* some callers to avoid an extra spinlock cycle.
*/
static bool
PinBuffer(BufferDesc *buf)
{
int b = buf->buf_id;
bool result;
if (PrivateRefCount[b] == 0)
{
/*
* Use NoHoldoff here because we don't want the unlock to be a
* potential place to honor a QueryCancel request.
* (The caller should be holding off interrupts anyway.)
*/
LockBufHdr_NoHoldoff(buf);
buf->refcount++;
result = (buf->flags & BM_VALID) != 0;
UnlockBufHdr_NoHoldoff(buf);
}
else
{
/* If we previously pinned the buffer, it must surely be valid */
result = true;
}
PrivateRefCount[b]++;
Assert(PrivateRefCount[b] > 0);
ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(buf));
return result;
}
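/*
 * Illustrative sketch only -- not part of this patch.  It models the
 * two-level pin counting used by PinBuffer/UnpinBuffer above: each backend
 * keeps a private count per buffer and touches the shared, lock-protected
 * count only on its own 0->1 and 1->0 transitions, so repeated pins by the
 * same backend never contend.  One demo buffer, a pthread mutex as the
 * header spinlock; all names are hypothetical stand-ins.
 */
#include <pthread.h>

static pthread_mutex_t demo_hdr_lock = PTHREAD_MUTEX_INITIALIZER;
static int	demo_shared_refcount = 0;	/* shared pool state */
static int	demo_private_refcount = 0;	/* this backend only */

static void
demo_pin(void)
{
	if (demo_private_refcount == 0)
	{
		/* first pin by this backend: publish it in the shared header */
		pthread_mutex_lock(&demo_hdr_lock);
		demo_shared_refcount++;
		pthread_mutex_unlock(&demo_hdr_lock);
	}
	demo_private_refcount++;
}

static void
demo_unpin(void)
{
	demo_private_refcount--;
	if (demo_private_refcount == 0)
	{
		/* last local pin gone: withdraw it from the shared header */
		pthread_mutex_lock(&demo_hdr_lock);
		demo_shared_refcount--;
		pthread_mutex_unlock(&demo_hdr_lock);
	}
}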
/*
* PinBuffer_Locked -- as above, but caller already locked the buffer header.
* The spinlock is released before return.
*
* Note: use of this routine is frequently mandatory, not just an optimization
* to save a spin lock/unlock cycle, because we need to pin a buffer before
* its state can change under us.
*/ */
static void static void
PinBuffer(BufferDesc *buf, bool fixOwner) PinBuffer_Locked(BufferDesc *buf)
{ {
int b = BufferDescriptorGetBuffer(buf) - 1; int b = buf->buf_id;
if (PrivateRefCount[b] == 0) if (PrivateRefCount[b] == 0)
buf->refcount++; buf->refcount++;
/* NoHoldoff since we mustn't accept cancel interrupt here */
UnlockBufHdr_NoHoldoff(buf);
PrivateRefCount[b]++; PrivateRefCount[b]++;
Assert(PrivateRefCount[b] > 0); Assert(PrivateRefCount[b] > 0);
if (fixOwner)
ResourceOwnerRememberBuffer(CurrentResourceOwner, ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(buf)); BufferDescriptorGetBuffer(buf));
/* Now we can accept cancel */
RESUME_INTERRUPTS();
} }
/* /*
* UnpinBuffer -- make buffer available for replacement. * UnpinBuffer -- make buffer available for replacement.
* *
* This should be applied only to shared buffers, never local ones. * This should be applied only to shared buffers, never local ones.
* Bufmgr lock must be held by caller.
* *
* Most but not all callers want CurrentResourceOwner to be adjusted. * Most but not all callers want CurrentResourceOwner to be adjusted.
*
* If we are releasing a buffer during VACUUM, and it's not been otherwise
* used recently, and trashOK is true, send the buffer to the freelist.
*/ */
static void static void
UnpinBuffer(BufferDesc *buf, bool fixOwner) UnpinBuffer(BufferDesc *buf, bool fixOwner, bool trashOK)
{ {
int b = BufferDescriptorGetBuffer(buf) - 1; int b = buf->buf_id;
if (fixOwner) if (fixOwner)
ResourceOwnerForgetBuffer(CurrentResourceOwner, ResourceOwnerForgetBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(buf)); BufferDescriptorGetBuffer(buf));
Assert(buf->refcount > 0);
Assert(PrivateRefCount[b] > 0); Assert(PrivateRefCount[b] > 0);
PrivateRefCount[b]--; PrivateRefCount[b]--;
if (PrivateRefCount[b] == 0) if (PrivateRefCount[b] == 0)
{ {
buf->refcount--; bool trash_buffer = false;
/* I'd better not still hold any locks on the buffer */ /* I'd better not still hold any locks on the buffer */
Assert(!LWLockHeldByMe(buf->cntx_lock)); Assert(!LWLockHeldByMe(buf->content_lock));
Assert(!LWLockHeldByMe(buf->io_in_progress_lock)); Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
/* NoHoldoff ensures we don't lose control before sending signal */
LockBufHdr_NoHoldoff(buf);
/* Decrement the shared reference count */
Assert(buf->refcount > 0);
buf->refcount--;
/* Mark the buffer recently used, unless we are in VACUUM */
if (!strategy_hint_vacuum)
{
if (buf->usage_count < BM_MAX_USAGE_COUNT)
buf->usage_count++;
} }
else if (trashOK &&
buf->refcount == 0 &&
buf->usage_count == 0)
trash_buffer = true;
if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 && if ((buf->flags & BM_PIN_COUNT_WAITER) &&
buf->refcount == 1) buf->refcount == 1)
{ {
/* we just released the last pin other than the waiter's */ /* we just released the last pin other than the waiter's */
BackendId wait_backend_id = buf->wait_backend_id;
buf->flags &= ~BM_PIN_COUNT_WAITER; buf->flags &= ~BM_PIN_COUNT_WAITER;
ProcSendSignal(buf->wait_backend_id); UnlockBufHdr_NoHoldoff(buf);
ProcSendSignal(wait_backend_id);
} }
else else
{ UnlockBufHdr_NoHoldoff(buf);
/* do nothing */
/*
* If VACUUM is releasing an otherwise-unused buffer, send it to
* the freelist for near-term reuse. We put it at the tail so that
* it won't be used before any invalid buffers that may exist.
*/
if (trash_buffer)
StrategyFreeBuffer(buf, false);
} }
} }
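/*
 * Illustrative sketch only -- not part of this patch.  It condenses the
 * recency bookkeeping UnpinBuffer does above: on the last unpin the
 * usage_count is bumped (up to a cap) so the clock sweep will pass over
 * the buffer for a while, except that a buffer released by VACUUM and not
 * otherwise in use is offered straight back to the freelist.  The names,
 * the cap value and the return convention are hypothetical stand-ins;
 * assumes the caller already holds the buffer header lock.
 */
#define DEMO_MAX_USAGE_COUNT 5

typedef struct demo_clock_buf
{
	int			refcount;
	int			usage_count;	/* clock-sweep recency counter */
} demo_clock_buf;

static int						/* returns 1 if caller should free-list it */
demo_note_unpin(demo_clock_buf *buf, int vacuum_hint, int trash_ok)
{
	buf->refcount--;
	if (!vacuum_hint)
	{
		if (buf->usage_count < DEMO_MAX_USAGE_COUNT)
			buf->usage_count++;	/* recently used: the sweep will skip it */
		return 0;
	}
	/* VACUUM: don't inflate recency; maybe recycle the buffer right away */
	return trash_ok && buf->refcount == 0 && buf->usage_count == 0;
}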
/* /*
* BufferSync -- Write out dirty buffers in the pool. * BufferSync -- Write out all dirty buffers in the pool.
* *
* This is called at checkpoint time to write out all dirty shared buffers, * This is called at checkpoint time to write out all dirty shared buffers.
* and by the background writer process to write out some of the dirty blocks.
* percent/maxpages should be -1 in the former case, and limit values (>= 0)
* in the latter.
*
* Returns the number of buffers written.
*/ */
int void
BufferSync(int percent, int maxpages) BufferSync(void)
{ {
BufferDesc **dirty_buffers; int buf_id;
BufferTag *buftags; int num_to_scan;
int num_buffer_dirty;
int i;
/* If either limit is zero then we are disabled from doing anything... */
if (percent == 0 || maxpages == 0)
return 0;
/* /*
* Get a list of all currently dirty buffers and how many there are. * Find out where to start the circular scan.
* We do not flush buffers that get dirtied after we started. They
* have to wait until the next checkpoint.
*/ */
dirty_buffers = (BufferDesc **) palloc(NBuffers * sizeof(BufferDesc *)); buf_id = StrategySyncStart();
buftags = (BufferTag *) palloc(NBuffers * sizeof(BufferTag));
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); /* Make sure we can handle the pin inside SyncOneBuffer */
num_buffer_dirty = StrategyDirtyBufferList(dirty_buffers, buftags, ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
NBuffers);
/* /*
* If called by the background writer, we are usually asked to only * Loop over all buffers.
* write out some portion of dirty buffers now, to prevent the IO
* storm at checkpoint time.
*/ */
if (percent > 0) num_to_scan = NBuffers;
while (num_to_scan-- > 0)
{ {
Assert(percent <= 100); (void) SyncOneBuffer(buf_id, false);
num_buffer_dirty = (num_buffer_dirty * percent + 99) / 100; if (++buf_id >= NBuffers)
buf_id = 0;
} }
if (maxpages > 0 && num_buffer_dirty > maxpages) }
num_buffer_dirty = maxpages;
/* Make sure we can handle the pin inside the loop */ /*
* BgBufferSync -- Write out some dirty buffers in the pool.
*
* This is called periodically by the background writer process.
*/
void
BgBufferSync(void)
{
static int buf_id1 = 0;
int buf_id2;
int num_to_scan;
int num_written;
/* Make sure we can handle the pin inside SyncOneBuffer */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner); ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
/* /*
* Loop over buffers to be written. Note the BufMgrLock is held at * To minimize work at checkpoint time, we want to try to keep all the
* loop top, but is released and reacquired within FlushBuffer, so we * buffers clean; this motivates a scan that proceeds sequentially through
* aren't holding it long. * all buffers. But we are also charged with ensuring that buffers that
* will be recycled soon are clean when needed; these buffers are the
* ones just ahead of the StrategySyncStart point. We make a separate
* scan through those.
*/ */
for (i = 0; i < num_buffer_dirty; i++)
{
BufferDesc *bufHdr = dirty_buffers[i];
/* /*
* Check it is still the same page and still needs writing. * This loop runs over all buffers, including pinned ones. The
* * starting point advances through the buffer pool on successive calls.
* We can check bufHdr->cntxDirty here *without* holding any lock on
* buffer context as long as we set this flag in access methods
* *before* logging changes with XLogInsert(): if someone will set
* cntxDirty just after our check we don't worry because of our
* checkpoint.redo points before log record for upcoming changes
* and so we are not required to write such dirty buffer.
*/ */
if (!(bufHdr->flags & BM_VALID)) if (bgwriter_all_percent > 0.0 && bgwriter_all_maxpages > 0)
continue; {
if (!BUFFERTAGS_EQUAL(bufHdr->tag, buftags[i])) num_to_scan = (int) ((NBuffers * bgwriter_all_percent + 99) / 100);
continue; num_written = 0;
if (!(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
continue;
/* while (num_to_scan-- > 0)
* IO synchronization. Note that we do it with unpinned buffer to
* avoid conflicts with FlushRelationBuffers.
*/
if (bufHdr->flags & BM_IO_IN_PROGRESS)
{ {
WaitIO(bufHdr); if (SyncOneBuffer(buf_id1, false))
/* Still need writing? */ num_written++;
if (!(bufHdr->flags & BM_VALID)) if (++buf_id1 >= NBuffers)
continue; buf_id1 = 0;
if (!BUFFERTAGS_EQUAL(bufHdr->tag, buftags[i])) if (num_written >= bgwriter_all_maxpages)
continue; break;
if (!(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)) }
continue;
} }
/* /*
* Here: no one doing IO for this buffer and it's dirty. Pin * This loop considers only unpinned buffers close to the clock sweep
* buffer now and set IO state for it *before* acquiring shlock to * point.
* avoid conflicts with FlushRelationBuffers.
*/ */
PinBuffer(bufHdr, true); if (bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0)
StartBufferIO(bufHdr, false); {
num_to_scan = (int) ((NBuffers * bgwriter_lru_percent + 99) / 100);
num_written = 0;
FlushBuffer(bufHdr, NULL, false); buf_id2 = StrategySyncStart();
TerminateBufferIO(bufHdr, 0); while (num_to_scan-- > 0)
UnpinBuffer(bufHdr, true); {
if (SyncOneBuffer(buf_id2, true))
num_written++;
if (++buf_id2 >= NBuffers)
buf_id2 = 0;
if (num_written >= bgwriter_lru_maxpages)
break;
}
} }
LWLockRelease(BufMgrLock);
pfree(dirty_buffers);
pfree(buftags);
return num_buffer_dirty;
} }
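/*
 * Illustrative sketch only -- not part of this patch.  It isolates the
 * arithmetic of the circular scans in BgBufferSync above: scan a fixed
 * percentage of the pool (rounded up), wrap around at the end of the
 * buffer array, and stop early once the per-round page limit is hit.
 * demo_lru_scan and its parameters are hypothetical stand-ins for the
 * real GUC-driven loop.
 */
static int
demo_lru_scan(int start, int nbuffers, double percent, int maxpages,
			  int (*sync_one)(int buf_id))
{
	/* same rounding as above: a percentage of the pool, rounded up */
	int			num_to_scan = (int) ((nbuffers * percent + 99) / 100);
	int			num_written = 0;
	int			buf_id = start;

	while (num_to_scan-- > 0)
	{
		if (sync_one(buf_id))
			num_written++;
		if (++buf_id >= nbuffers)
			buf_id = 0;			/* wrap: the scan is circular */
		if (num_written >= maxpages)
			break;
	}
	return num_written;
}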
/* /*
* WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared. * SyncOneBuffer -- process a single buffer during syncing.
*
* If skip_pinned is true, we don't write currently-pinned buffers, nor
* buffers marked recently used, as these are not replacement candidates.
* *
* Should be entered with buffer manager lock held; releases it before * Returns true if buffer was written, else false. (This could be in error
* waiting and re-acquires it afterwards. * if FlushBuffers finds the buffer clean after locking it, but we don't
* care all that much.)
*
* Note: caller must have done ResourceOwnerEnlargeBuffers.
*/ */
static void static bool
WaitIO(BufferDesc *buf) SyncOneBuffer(int buf_id, bool skip_pinned)
{ {
BufferDesc *bufHdr = &BufferDescriptors[buf_id];
/* /*
* Changed to wait until there's no IO - Inoue 01/13/2000 * Check whether buffer needs writing.
* *
* Note this is *necessary* because an error abort in the process doing * We can make this check without taking the buffer content lock
* I/O could release the io_in_progress_lock prematurely. See * so long as we mark pages dirty in access methods *before* logging
* AbortBufferIO. * changes with XLogInsert(): if someone marks the buffer dirty
* just after our check we don't worry because our checkpoint.redo
* points before log record for upcoming changes and so we are not
* required to write such dirty buffer.
*/ */
while ((buf->flags & BM_IO_IN_PROGRESS) != 0) LockBufHdr(bufHdr);
if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
{ {
LWLockRelease(BufMgrLock); UnlockBufHdr(bufHdr);
LWLockAcquire(buf->io_in_progress_lock, LW_SHARED); return false;
LWLockRelease(buf->io_in_progress_lock);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
} }
if (skip_pinned &&
(bufHdr->refcount != 0 || bufHdr->usage_count != 0))
{
UnlockBufHdr(bufHdr);
return false;
}
/*
* Pin it, share-lock it, write it. (FlushBuffer will do nothing
* if the buffer is clean by the time we've locked it.)
*/
PinBuffer_Locked(bufHdr);
LWLockAcquire(bufHdr->content_lock, LW_SHARED);
FlushBuffer(bufHdr, NULL);
LWLockRelease(bufHdr->content_lock);
UnpinBuffer(bufHdr, true, false /* don't change freelist */ );
return true;
} }
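/*
 * Illustrative sketch only -- not part of this patch.  It restates the
 * skip test SyncOneBuffer applies above as a stand-alone predicate: only
 * valid, dirty buffers are written, and when skip_pinned is set, pinned or
 * recently used buffers are passed over because they are not replacement
 * candidates anyway.  The flag bits and names are hypothetical stand-ins.
 */
#define DEMO_VALID	0x01
#define DEMO_DIRTY	0x02

typedef struct demo_sync_buf
{
	int			flags;
	int			refcount;		/* shared pin count */
	int			usage_count;	/* clock-sweep recency counter */
} demo_sync_buf;

static int
demo_needs_bgwrite(const demo_sync_buf *buf, int skip_pinned)
{
	if (!(buf->flags & DEMO_VALID) || !(buf->flags & DEMO_DIRTY))
		return 0;				/* nothing (safe) to write */
	if (skip_pinned && (buf->refcount != 0 || buf->usage_count != 0))
		return 0;				/* in use or recently used: not a candidate */
	return 1;
}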
@@ -888,6 +1064,9 @@ AtEOXact_Buffers(bool isCommit)
AtEOXact_LocalBuffers(isCommit); AtEOXact_LocalBuffers(isCommit);
#endif #endif
/* Make sure we reset the strategy hint in case VACUUM errored out */
StrategyHintVacuum(false);
} }
/* /*
@@ -912,9 +1091,7 @@ AtProcExit_Buffers(void)
* here, it suggests that ResourceOwners are messed up. * here, it suggests that ResourceOwners are messed up.
*/ */
PrivateRefCount[i] = 1; /* make sure we release shared pin */ PrivateRefCount[i] = 1; /* make sure we release shared pin */
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); UnpinBuffer(buf, false, false /* don't change freelist */ );
UnpinBuffer(buf, false);
LWLockRelease(BufMgrLock);
Assert(PrivateRefCount[i] == 0); Assert(PrivateRefCount[i] == 0);
} }
} }
@@ -941,6 +1118,7 @@ PrintBufferLeakWarning(Buffer buffer)
loccount = PrivateRefCount[buffer - 1]; loccount = PrivateRefCount[buffer - 1];
} }
/* theoretically we should lock the bufhdr here */
elog(WARNING, elog(WARNING,
"buffer refcount leak: [%03d] " "buffer refcount leak: [%03d] "
"(rel=%u/%u/%u, blockNum=%u, flags=0x%x, refcount=%u %d)", "(rel=%u/%u/%u, blockNum=%u, flags=0x%x, refcount=%u %d)",
@@ -961,7 +1139,7 @@ PrintBufferLeakWarning(Buffer buffer)
void void
FlushBufferPool(void) FlushBufferPool(void)
{ {
BufferSync(-1, -1); BufferSync();
smgrsync(); smgrsync();
} }
@@ -988,12 +1166,17 @@ BufmgrCommit(void)
BlockNumber BlockNumber
BufferGetBlockNumber(Buffer buffer) BufferGetBlockNumber(Buffer buffer)
{ {
BufferDesc *bufHdr;
Assert(BufferIsPinned(buffer)); Assert(BufferIsPinned(buffer));
if (BufferIsLocal(buffer)) if (BufferIsLocal(buffer))
return LocalBufferDescriptors[-buffer - 1].tag.blockNum; bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
else else
return BufferDescriptors[buffer - 1].tag.blockNum; bufHdr = &BufferDescriptors[buffer - 1];
/* pinned, so OK to read tag without spinlock */
return bufHdr->tag.blockNum;
} }
/* /*
@@ -1013,7 +1196,7 @@ BufferGetFileNode(Buffer buffer)
else else
bufHdr = &BufferDescriptors[buffer - 1]; bufHdr = &BufferDescriptors[buffer - 1];
return (bufHdr->tag.rnode); return bufHdr->tag.rnode;
} }
/* /*
@@ -1026,41 +1209,28 @@ BufferGetFileNode(Buffer buffer)
* However, we will need to force the changes to disk via fsync before * However, we will need to force the changes to disk via fsync before
* we can checkpoint WAL. * we can checkpoint WAL.
* *
* BufMgrLock must be held at entry, and the buffer must be pinned. The * The caller must hold a pin on the buffer and have share-locked the
* caller is also responsible for doing StartBufferIO/TerminateBufferIO. * buffer contents. (Note: a share-lock does not prevent updates of
* hint bits in the buffer, so the page could change while the write
* is in progress, but we assume that that will not invalidate the data
* written.)
* *
* If the caller has an smgr reference for the buffer's relation, pass it * If the caller has an smgr reference for the buffer's relation, pass it
* as the second parameter. If not, pass NULL. (Do not open relation * as the second parameter. If not, pass NULL.
* while holding BufMgrLock!)
*
* When earlylock is TRUE, we grab the per-buffer sharelock before releasing
* BufMgrLock, rather than after. Normally this would be a bad idea since
* we might deadlock, but it is safe and necessary when called from
* BufferAlloc() --- see comments therein.
*/ */
static void static void
FlushBuffer(BufferDesc *buf, SMgrRelation reln, bool earlylock) FlushBuffer(BufferDesc *buf, SMgrRelation reln)
{ {
Buffer buffer = BufferDescriptorGetBuffer(buf);
XLogRecPtr recptr; XLogRecPtr recptr;
ErrorContextCallback errcontext; ErrorContextCallback errcontext;
/* Transpose cntxDirty into flags while holding BufMgrLock */
buf->cntxDirty = false;
buf->flags |= BM_DIRTY;
/* To check if block content changed while flushing. - vadim 01/17/97 */
buf->flags &= ~BM_JUST_DIRTIED;
/* /*
* If earlylock, grab buffer sharelock before anyone else could re-lock * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
* the buffer. * false, then someone else flushed the buffer before we could, so
* we need not do anything.
*/ */
if (earlylock) if (!StartBufferIO(buf, false))
LockBuffer(buffer, BUFFER_LOCK_SHARE); return;
/* Release BufMgrLock while doing xlog work */
LWLockRelease(BufMgrLock);
/* Setup error traceback support for ereport() */ /* Setup error traceback support for ereport() */
errcontext.callback = buffer_write_error_callback; errcontext.callback = buffer_write_error_callback;
@@ -1068,20 +1238,12 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, bool earlylock)
errcontext.previous = error_context_stack; errcontext.previous = error_context_stack;
error_context_stack = &errcontext; error_context_stack = &errcontext;
/* Find smgr relation for buffer while holding minimal locks */ /* Find smgr relation for buffer */
if (reln == NULL) if (reln == NULL)
reln = smgropen(buf->tag.rnode); reln = smgropen(buf->tag.rnode);
/* /*
* Protect buffer content against concurrent update. (Note that * Force XLOG flush up to buffer's LSN. This implements the basic WAL
* hint-bit updates can still occur while the write is in progress,
* but we assume that that will not invalidate the data written.)
*/
if (!earlylock)
LockBuffer(buffer, BUFFER_LOCK_SHARE);
/*
* Force XLOG flush for buffer' LSN. This implements the basic WAL
* rule that log updates must hit disk before any of the data-file * rule that log updates must hit disk before any of the data-file
* changes they describe do. * changes they describe do.
*/ */
@@ -1090,35 +1252,30 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, bool earlylock)
/* /*
* Now it's safe to write buffer to disk. Note that no one else should * Now it's safe to write buffer to disk. Note that no one else should
* have been able to write it while we were busy with locking and log * have been able to write it while we were busy with log flushing
* flushing because caller has set the IO flag. * because we have the io_in_progress lock.
*
* It would be better to clear BM_JUST_DIRTIED right here, but we'd have
* to reacquire the BufMgrLock and it doesn't seem worth it.
*/ */
/* To check if block content changes while flushing. - vadim 01/17/97 */
LockBufHdr_NoHoldoff(buf);
buf->flags &= ~BM_JUST_DIRTIED;
UnlockBufHdr_NoHoldoff(buf);
smgrwrite(reln, smgrwrite(reln,
buf->tag.blockNum, buf->tag.blockNum,
(char *) MAKE_PTR(buf->data), (char *) BufHdrGetBlock(buf),
false); false);
/* Pop the error context stack */
error_context_stack = errcontext.previous;
/*
* Release the per-buffer readlock, reacquire BufMgrLock.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
BufferFlushCount++; BufferFlushCount++;
/* /*
* If this buffer was marked by someone as DIRTY while we were * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set)
* flushing it out we must not clear DIRTY flag - vadim 01/17/97 * and end the io_in_progress state.
*/ */
if (!(buf->flags & BM_JUST_DIRTIED)) TerminateBufferIO(buf, true, 0);
buf->flags &= ~BM_DIRTY;
/* Pop the error context stack */
error_context_stack = errcontext.previous;
} }
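/*
 * Illustrative sketch only -- not part of this patch.  It shows the write
 * ordering FlushBuffer enforces above: the WAL describing the newest
 * change to a page must reach disk before the page itself does.
 * demo_page_lsn(), demo_xlog_flush() and demo_storage_write() are
 * hypothetical stand-ins for reading the page's LSN, XLogFlush() and
 * smgrwrite().
 */
typedef unsigned long long demo_lsn;

extern demo_lsn demo_page_lsn(const void *page);
extern void demo_xlog_flush(demo_lsn upto);
extern void demo_storage_write(unsigned blockno, const void *page);

static void
demo_flush_one_page(unsigned blockno, const void *page)
{
	/* basic WAL rule: flush the log covering this page first ... */
	demo_xlog_flush(demo_page_lsn(page));
	/* ... and only then write the data page itself */
	demo_storage_write(blockno, page);
}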
/* /*
@@ -1210,62 +1367,24 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
bufHdr->tag.rnode.dbNode, bufHdr->tag.rnode.dbNode,
bufHdr->tag.rnode.relNode, bufHdr->tag.rnode.relNode,
LocalRefCount[i]); LocalRefCount[i]);
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); CLEAR_BUFFERTAG(bufHdr->tag);
bufHdr->cntxDirty = false; bufHdr->flags = 0;
bufHdr->tag.rnode.relNode = InvalidOid; bufHdr->usage_count = 0;
} }
} }
return; return;
} }
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); for (i = 0; i < NBuffers; i++)
for (i = 1; i <= NBuffers; i++)
{ {
bufHdr = &BufferDescriptors[i - 1]; bufHdr = &BufferDescriptors[i];
recheck: LockBufHdr(bufHdr);
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
bufHdr->tag.blockNum >= firstDelBlock) bufHdr->tag.blockNum >= firstDelBlock)
{ InvalidateBuffer(bufHdr); /* releases spinlock */
/* else
* If there is I/O in progress, better wait till it's done; UnlockBufHdr(bufHdr);
* don't want to delete the relation out from under someone
* who's just trying to flush the buffer!
*/
if (bufHdr->flags & BM_IO_IN_PROGRESS)
{
WaitIO(bufHdr);
/*
* By now, the buffer very possibly belongs to some other
* rel, so check again before proceeding.
*/
goto recheck;
}
/*
* There should be no pin on the buffer.
*/
if (bufHdr->refcount != 0)
elog(ERROR, "block %u of %u/%u/%u is still referenced (private %d, global %u)",
bufHdr->tag.blockNum,
bufHdr->tag.rnode.spcNode,
bufHdr->tag.rnode.dbNode,
bufHdr->tag.rnode.relNode,
PrivateRefCount[i - 1], bufHdr->refcount);
/* Now we can do what we came for */
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
bufHdr->cntxDirty = false;
/*
* And mark the buffer as no longer occupied by this rel.
*/
StrategyInvalidateBuffer(bufHdr);
}
} }
LWLockRelease(BufMgrLock);
} }
/* --------------------------------------------------------------------- /* ---------------------------------------------------------------------
@@ -1285,47 +1404,20 @@ DropBuffers(Oid dbid)
int i; int i;
BufferDesc *bufHdr; BufferDesc *bufHdr;
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
for (i = 1; i <= NBuffers; i++)
{
bufHdr = &BufferDescriptors[i - 1];
recheck:
if (bufHdr->tag.rnode.dbNode == dbid)
{
/* /*
* If there is I/O in progress, better wait till it's done; * We needn't consider local buffers, since by assumption the target
* don't want to delete the database out from under someone * database isn't our own.
* who's just trying to flush the buffer!
*/ */
if (bufHdr->flags & BM_IO_IN_PROGRESS)
{
WaitIO(bufHdr);
/*
* By now, the buffer very possibly belongs to some other
* DB, so check again before proceeding.
*/
goto recheck;
}
/* Now we can do what we came for */
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
bufHdr->cntxDirty = false;
/* for (i = 0; i < NBuffers; i++)
* The thing should be free, if caller has checked that no {
* backends are running in that database. bufHdr = &BufferDescriptors[i];
*/ LockBufHdr(bufHdr);
Assert(bufHdr->refcount == 0); if (bufHdr->tag.rnode.dbNode == dbid)
InvalidateBuffer(bufHdr); /* releases spinlock */
/* else
* And mark the buffer as no longer occupied by this page. UnlockBufHdr(bufHdr);
*/
StrategyInvalidateBuffer(bufHdr);
}
} }
LWLockRelease(BufMgrLock);
} }
/* ----------------------------------------------------------------- /* -----------------------------------------------------------------
@@ -1342,33 +1434,18 @@ PrintBufferDescs(void)
int i; int i;
BufferDesc *buf = BufferDescriptors; BufferDesc *buf = BufferDescriptors;
if (IsUnderPostmaster)
{
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
for (i = 0; i < NBuffers; ++i, ++buf) for (i = 0; i < NBuffers; ++i, ++buf)
{ {
/* theoretically we should lock the bufhdr here */
elog(LOG, elog(LOG,
"[%02d] (freeNext=%d, freePrev=%d, rel=%u/%u/%u, " "[%02d] (freeNext=%d, rel=%u/%u/%u, "
"blockNum=%u, flags=0x%x, refcount=%u %d)", "blockNum=%u, flags=0x%x, refcount=%u %d)",
i, buf->freeNext, buf->freePrev, i, buf->freeNext,
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode, buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode, buf->tag.rnode.relNode,
buf->tag.blockNum, buf->flags, buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]); buf->refcount, PrivateRefCount[i]);
} }
LWLockRelease(BufMgrLock);
}
else
{
/* interactive backend */
for (i = 0; i < NBuffers; ++i, ++buf)
{
printf("[%-2d] (%u/%u/%u, %u) flags=0x%x, refcount=%u %d)\n",
i, buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode, buf->tag.blockNum,
buf->flags, buf->refcount, PrivateRefCount[i]);
}
}
} }
#endif #endif
@@ -1379,20 +1456,21 @@ PrintPinnedBufs(void)
int i; int i;
BufferDesc *buf = BufferDescriptors; BufferDesc *buf = BufferDescriptors;
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
for (i = 0; i < NBuffers; ++i, ++buf) for (i = 0; i < NBuffers; ++i, ++buf)
{ {
if (PrivateRefCount[i] > 0) if (PrivateRefCount[i] > 0)
elog(NOTICE, {
"[%02d] (freeNext=%d, freePrev=%d, rel=%u/%u/%u, " /* theoretically we should lock the bufhdr here */
elog(LOG,
"[%02d] (freeNext=%d, rel=%u/%u/%u, "
"blockNum=%u, flags=0x%x, refcount=%u %d)", "blockNum=%u, flags=0x%x, refcount=%u %d)",
i, buf->freeNext, buf->freePrev, i, buf->freeNext,
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode, buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode, buf->tag.rnode.relNode,
buf->tag.blockNum, buf->flags, buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]); buf->refcount, PrivateRefCount[i]);
} }
LWLockRelease(BufMgrLock); }
} }
#endif #endif
@@ -1451,8 +1529,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
bufHdr = &LocalBufferDescriptors[i]; bufHdr = &LocalBufferDescriptors[i];
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
{ {
if ((bufHdr->flags & BM_VALID) && if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
{ {
ErrorContextCallback errcontext; ErrorContextCallback errcontext;
@@ -1464,11 +1541,10 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
smgrwrite(rel->rd_smgr, smgrwrite(rel->rd_smgr,
bufHdr->tag.blockNum, bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data), (char *) LocalBufHdrGetBlock(bufHdr),
true); true);
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
bufHdr->cntxDirty = false;
/* Pop the error context stack */ /* Pop the error context stack */
error_context_stack = errcontext.previous; error_context_stack = errcontext.previous;
@@ -1478,7 +1554,11 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
RelationGetRelationName(rel), firstDelBlock, RelationGetRelationName(rel), firstDelBlock,
bufHdr->tag.blockNum, LocalRefCount[i]); bufHdr->tag.blockNum, LocalRefCount[i]);
if (bufHdr->tag.blockNum >= firstDelBlock) if (bufHdr->tag.blockNum >= firstDelBlock)
bufHdr->tag.rnode.relNode = InvalidOid; {
CLEAR_BUFFERTAG(bufHdr->tag);
bufHdr->flags = 0;
bufHdr->usage_count = 0;
}
} }
} }
@@ -1488,46 +1568,40 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
/* Make sure we can handle the pin inside the loop */ /* Make sure we can handle the pin inside the loop */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner); ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
for (i = 0; i < NBuffers; i++) for (i = 0; i < NBuffers; i++)
{ {
bufHdr = &BufferDescriptors[i]; bufHdr = &BufferDescriptors[i];
recheck:
LockBufHdr(bufHdr);
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
{ {
if ((bufHdr->flags & BM_VALID) && if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty))
{ {
PinBuffer(bufHdr, true); PinBuffer_Locked(bufHdr);
/* Someone else might be flushing buffer */ LWLockAcquire(bufHdr->content_lock, LW_SHARED);
if (bufHdr->flags & BM_IO_IN_PROGRESS) FlushBuffer(bufHdr, rel->rd_smgr);
WaitIO(bufHdr); LWLockRelease(bufHdr->content_lock);
/* Still dirty? */ UnpinBuffer(bufHdr, true, false /* no freelist change */ );
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) /*
{ * As soon as we unpin, it's possible for someone to take
StartBufferIO(bufHdr, false); * the buffer away from us; so loop back to re-lock and
* re-check if it still belongs to the target relation.
FlushBuffer(bufHdr, rel->rd_smgr, false); */
goto recheck;
TerminateBufferIO(bufHdr, 0);
}
UnpinBuffer(bufHdr, true);
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
elog(ERROR, "FlushRelationBuffers(\"%s\", %u): block %u was re-dirtied",
RelationGetRelationName(rel), firstDelBlock,
bufHdr->tag.blockNum);
} }
if (bufHdr->refcount != 0) /*
elog(ERROR, "FlushRelationBuffers(\"%s\", %u): block %u is referenced (private %d, global %u)", * Even though it's not dirty, it could still be pinned because
RelationGetRelationName(rel), firstDelBlock, * TerminateIO and UnpinBuffer are separate actions. Hence,
bufHdr->tag.blockNum, * we can't error out on nonzero reference count here.
PrivateRefCount[i], bufHdr->refcount); */
if (bufHdr->tag.blockNum >= firstDelBlock) if (bufHdr->tag.blockNum >= firstDelBlock)
StrategyInvalidateBuffer(bufHdr); InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
} }
else
UnlockBufHdr(bufHdr);
} }
LWLockRelease(BufMgrLock);
} }
/* /*
...@@ -1547,7 +1621,11 @@ ReleaseBuffer(Buffer buffer) ...@@ -1547,7 +1621,11 @@ ReleaseBuffer(Buffer buffer)
if (BufferIsLocal(buffer)) if (BufferIsLocal(buffer))
{ {
Assert(LocalRefCount[-buffer - 1] > 0); Assert(LocalRefCount[-buffer - 1] > 0);
bufHdr = &LocalBufferDescriptors[-buffer - 1];
LocalRefCount[-buffer - 1]--; LocalRefCount[-buffer - 1]--;
if (LocalRefCount[-buffer - 1] == 0 &&
bufHdr->usage_count < BM_MAX_USAGE_COUNT)
bufHdr->usage_count++;
return; return;
} }
@@ -1558,11 +1636,7 @@ ReleaseBuffer(Buffer buffer)
if (PrivateRefCount[buffer - 1] > 1) if (PrivateRefCount[buffer - 1] > 1)
PrivateRefCount[buffer - 1]--; PrivateRefCount[buffer - 1]--;
else else
{ UnpinBuffer(bufHdr, false, true);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
UnpinBuffer(bufHdr, false);
LWLockRelease(BufMgrLock);
}
} }
/* /*
@@ -1585,88 +1659,6 @@ IncrBufferRefCount(Buffer buffer)
PrivateRefCount[buffer - 1]++; PrivateRefCount[buffer - 1]++;
} }
#ifdef NOT_USED
void
IncrBufferRefCount_Debug(char *file, int line, Buffer buffer)
{
IncrBufferRefCount(buffer);
if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer))
{
BufferDesc *buf = &BufferDescriptors[buffer - 1];
fprintf(stderr,
"PIN(Incr) %d rel = %u/%u/%u, blockNum = %u, "
"refcount = %d, file: %s, line: %d\n",
buffer,
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
#endif
#ifdef NOT_USED
void
ReleaseBuffer_Debug(char *file, int line, Buffer buffer)
{
ReleaseBuffer(buffer);
if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer))
{
BufferDesc *buf = &BufferDescriptors[buffer - 1];
fprintf(stderr,
"UNPIN(Rel) %d rel = %u/%u/%u, blockNum = %u, "
"refcount = %d, file: %s, line: %d\n",
buffer,
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
#endif
#ifdef NOT_USED
Buffer
ReleaseAndReadBuffer_Debug(char *file,
int line,
Buffer buffer,
Relation relation,
BlockNumber blockNum)
{
bool bufferValid;
Buffer b;
bufferValid = BufferIsValid(buffer);
b = ReleaseAndReadBuffer(buffer, relation, blockNum);
if (ShowPinTrace && bufferValid && BufferIsLocal(buffer)
&& is_userbuffer(buffer))
{
BufferDesc *buf = &BufferDescriptors[buffer - 1];
fprintf(stderr,
"UNPIN(Rel&Rd) %d rel = %u/%u/%u, blockNum = %u, "
"refcount = %d, file: %s, line: %d\n",
buffer,
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer))
{
BufferDesc *buf = &BufferDescriptors[b - 1];
fprintf(stderr,
"PIN(Rel&Rd) %d rel = %u/%u/%u, blockNum = %u, "
"refcount = %d, file: %s, line: %d\n",
b,
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode, buf->tag.blockNum,
PrivateRefCount[b - 1], file, line);
}
return b;
}
#endif
/* /*
* SetBufferCommitInfoNeedsSave * SetBufferCommitInfoNeedsSave
* *
@@ -1682,7 +1674,7 @@ ReleaseAndReadBuffer_Debug(char *file,
* This routine might get called many times on the same page, if we are making * This routine might get called many times on the same page, if we are making
* the first scan after commit of an xact that added/deleted many tuples. * the first scan after commit of an xact that added/deleted many tuples.
* So, be as quick as we can if the buffer is already dirty. We do this by * So, be as quick as we can if the buffer is already dirty. We do this by
* not acquiring BufMgrLock if it looks like the status bits are already OK. * not acquiring spinlock if it looks like the status bits are already OK.
* (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after * (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after
* we look, because the buffer content update is already done and will be * we look, because the buffer content update is already done and will be
* reflected in the I/O.) * reflected in the I/O.)
@@ -1703,23 +1695,25 @@ SetBufferCommitInfoNeedsSave(Buffer buffer)
bufHdr = &BufferDescriptors[buffer - 1]; bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0);
if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) != if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
(BM_DIRTY | BM_JUST_DIRTIED)) (BM_DIRTY | BM_JUST_DIRTIED))
{ {
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0); Assert(bufHdr->refcount > 0);
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
LWLockRelease(BufMgrLock); UnlockBufHdr(bufHdr);
} }
} }
/* /*
* Release buffer context locks for shared buffers. * Release buffer content locks for shared buffers.
* *
* Used to clean up after errors. * Used to clean up after errors.
* *
* Currently, we can expect that lwlock.c's LWLockReleaseAll() took care * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
* of releasing buffer context locks per se; the only thing we need to deal * of releasing buffer content locks per se; the only thing we need to deal
* with here is clearing any PIN_COUNT request that was in progress. * with here is clearing any PIN_COUNT request that was in progress.
*/ */
void void
@@ -1731,7 +1725,7 @@ UnlockBuffers(void)
{ {
HOLD_INTERRUPTS(); /* don't want to die() partway through... */ HOLD_INTERRUPTS(); /* don't want to die() partway through... */
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); LockBufHdr_NoHoldoff(buf);
/* /*
* Don't complain if flag bit not set; it could have been * Don't complain if flag bit not set; it could have been
@@ -1741,18 +1735,19 @@ UnlockBuffers(void)
if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 && if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
buf->wait_backend_id == MyBackendId) buf->wait_backend_id == MyBackendId)
buf->flags &= ~BM_PIN_COUNT_WAITER; buf->flags &= ~BM_PIN_COUNT_WAITER;
LWLockRelease(BufMgrLock);
UnlockBufHdr_NoHoldoff(buf);
ProcCancelWaitForSignal(); ProcCancelWaitForSignal();
PinCountWaitBuf = NULL;
RESUME_INTERRUPTS(); RESUME_INTERRUPTS();
} }
PinCountWaitBuf = NULL;
} }
/* /*
* Acquire or release the cntx_lock for the buffer. * Acquire or release the content_lock for the buffer.
*/ */
void void
LockBuffer(Buffer buffer, int mode) LockBuffer(Buffer buffer, int mode)
@@ -1766,27 +1761,29 @@ LockBuffer(Buffer buffer, int mode)
buf = &(BufferDescriptors[buffer - 1]); buf = &(BufferDescriptors[buffer - 1]);
if (mode == BUFFER_LOCK_UNLOCK) if (mode == BUFFER_LOCK_UNLOCK)
LWLockRelease(buf->cntx_lock); LWLockRelease(buf->content_lock);
else if (mode == BUFFER_LOCK_SHARE) else if (mode == BUFFER_LOCK_SHARE)
LWLockAcquire(buf->cntx_lock, LW_SHARED); LWLockAcquire(buf->content_lock, LW_SHARED);
else if (mode == BUFFER_LOCK_EXCLUSIVE) else if (mode == BUFFER_LOCK_EXCLUSIVE)
{ {
LWLockAcquire(buf->cntx_lock, LW_EXCLUSIVE); LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
/* /*
* This is not the best place to set cntxDirty flag (eg indices do * This is not the best place to mark buffer dirty (eg indices do
* not always change buffer they lock in excl mode). But please * not always change buffer they lock in excl mode). But please
* remember that it's critical to set cntxDirty *before* logging * remember that it's critical to set dirty bit *before* logging
* changes with XLogInsert() - see comments in BufferSync(). * changes with XLogInsert() - see comments in SyncOneBuffer().
*/ */
buf->cntxDirty = true; LockBufHdr_NoHoldoff(buf);
buf->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
UnlockBufHdr_NoHoldoff(buf);
} }
else else
elog(ERROR, "unrecognized buffer lock mode: %d", mode); elog(ERROR, "unrecognized buffer lock mode: %d", mode);
} }
/* /*
* Acquire the cntx_lock for the buffer, but only if we don't have to wait. * Acquire the content_lock for the buffer, but only if we don't have to wait.
* *
* This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode. * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
*/ */
@@ -1801,15 +1798,17 @@ ConditionalLockBuffer(Buffer buffer)
buf = &(BufferDescriptors[buffer - 1]); buf = &(BufferDescriptors[buffer - 1]);
if (LWLockConditionalAcquire(buf->cntx_lock, LW_EXCLUSIVE)) if (LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE))
{ {
/* /*
* This is not the best place to set cntxDirty flag (eg indices do * This is not the best place to mark buffer dirty (eg indices do
* not always change buffer they lock in excl mode). But please * not always change buffer they lock in excl mode). But please
* remember that it's critical to set cntxDirty *before* logging * remember that it's critical to set dirty bit *before* logging
* changes with XLogInsert() - see comments in BufferSync(). * changes with XLogInsert() - see comments in SyncOneBuffer().
*/ */
buf->cntxDirty = true; LockBufHdr_NoHoldoff(buf);
buf->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
UnlockBufHdr_NoHoldoff(buf);
return true; return true;
} }
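/*
 * Illustrative sketch only -- not part of this patch.  It shows the
 * conditional-acquire-or-back-off idea behind ConditionalLockBuffer above
 * and behind the LWLockConditionalAcquire call in BufferAlloc's
 * dirty-victim path: rather than block on a content lock whose holder may
 * in turn be waiting on us, try the lock and let the caller give up (or
 * pick a different victim) if it is busy.  A pthread rwlock models the
 * LWLock; the names are hypothetical stand-ins.
 */
#include <pthread.h>
#include <stdbool.h>

static bool
demo_try_exclusive(pthread_rwlock_t *content_lock)
{
	/* succeed only if this cannot block; otherwise report failure */
	return pthread_rwlock_trywrlock(content_lock) == 0;
}

/* typical caller: back off instead of waiting */
static void
demo_use_or_skip(pthread_rwlock_t *content_lock, void (*work)(void))
{
	if (!demo_try_exclusive(content_lock))
		return;					/* busy: the caller tries something else */
	work();
	pthread_rwlock_unlock(content_lock);
}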
@@ -1861,25 +1860,25 @@ LockBufferForCleanup(Buffer buffer)
{ {
/* Try to acquire lock */ /* Try to acquire lock */
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); LockBufHdr_NoHoldoff(bufHdr);
Assert(bufHdr->refcount > 0); Assert(bufHdr->refcount > 0);
if (bufHdr->refcount == 1) if (bufHdr->refcount == 1)
{ {
/* Successfully acquired exclusive lock with pincount 1 */ /* Successfully acquired exclusive lock with pincount 1 */
LWLockRelease(BufMgrLock); UnlockBufHdr_NoHoldoff(bufHdr);
return; return;
} }
/* Failed, so mark myself as waiting for pincount 1 */ /* Failed, so mark myself as waiting for pincount 1 */
if (bufHdr->flags & BM_PIN_COUNT_WAITER) if (bufHdr->flags & BM_PIN_COUNT_WAITER)
{ {
LWLockRelease(BufMgrLock); UnlockBufHdr_NoHoldoff(bufHdr);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK); LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
elog(ERROR, "multiple backends attempting to wait for pincount 1"); elog(ERROR, "multiple backends attempting to wait for pincount 1");
} }
bufHdr->wait_backend_id = MyBackendId; bufHdr->wait_backend_id = MyBackendId;
bufHdr->flags |= BM_PIN_COUNT_WAITER; bufHdr->flags |= BM_PIN_COUNT_WAITER;
PinCountWaitBuf = bufHdr; PinCountWaitBuf = bufHdr;
LWLockRelease(BufMgrLock); UnlockBufHdr_NoHoldoff(bufHdr);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK); LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/* Wait to be signaled by UnpinBuffer() */ /* Wait to be signaled by UnpinBuffer() */
ProcWaitForSignal(); ProcWaitForSignal();
...@@ -1889,94 +1888,160 @@ LockBufferForCleanup(Buffer buffer) ...@@ -1889,94 +1888,160 @@ LockBufferForCleanup(Buffer buffer)
} }
/* /*
* Functions for IO error handling * Functions for buffer I/O handling
* *
* Note: We assume that nested buffer IO never occurs. * Note: We assume that nested buffer I/O never occurs.
* i.e at most one io_in_progress lock is held per proc. * i.e at most one io_in_progress lock is held per proc.
*
* Also note that these are used only for shared buffers, not local ones.
*/ */
/* /*
* Function:StartBufferIO * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
*/
static void
WaitIO(BufferDesc *buf)
{
/*
* Changed to wait until there's no IO - Inoue 01/13/2000
*
* Note this is *necessary* because an error abort in the process doing
* I/O could release the io_in_progress_lock prematurely. See
* AbortBufferIO.
*/
for (;;)
{
BufFlags sv_flags;
/*
* It may not be necessary to acquire the spinlock to check the
* flag here, but since this test is essential for correctness,
* we'd better play it safe.
*/
LockBufHdr(buf);
sv_flags = buf->flags;
UnlockBufHdr(buf);
if (!(sv_flags & BM_IO_IN_PROGRESS))
break;
LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
LWLockRelease(buf->io_in_progress_lock);
}
}
/*
* StartBufferIO: begin I/O on this buffer
* (Assumptions) * (Assumptions)
* My process is executing no IO * My process is executing no IO
* BufMgrLock is held
* BM_IO_IN_PROGRESS mask is not set for the buffer
* The buffer is Pinned * The buffer is Pinned
* *
* Because BufMgrLock is held, we are already in an interrupt holdoff here, * In some scenarios there are race conditions in which multiple backends
* and do not need another. * could attempt the same I/O operation concurrently. If someone else
* has already started I/O on this buffer then we will block on the
 * io_in_progress lock until that backend is done.
*
* Input operations are only attempted on buffers that are not BM_VALID,
* and output operations only on buffers that are BM_VALID and BM_DIRTY,
* so we can always tell if the work is already done.
*
* Returns TRUE if we successfully marked the buffer as I/O busy,
* FALSE if someone else already did the work.
*/ */
static void static bool
StartBufferIO(BufferDesc *buf, bool forInput) StartBufferIO(BufferDesc *buf, bool forInput)
{ {
Assert(!InProgressBuf); Assert(!InProgressBuf);
Assert(!(buf->flags & BM_IO_IN_PROGRESS));
buf->flags |= BM_IO_IN_PROGRESS;
for (;;)
{
/*
* Grab the io_in_progress lock so that other processes can wait for
* me to finish the I/O.
*/
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE); LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
/* NoHoldoff is OK since we now have an LWLock */
LockBufHdr_NoHoldoff(buf);
if (!(buf->flags & BM_IO_IN_PROGRESS))
break;
/*
* The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
* lock isn't held is if the process doing the I/O is recovering from
* an error (see AbortBufferIO). If that's the case, we must wait for
 * that process to get unwedged.
*/
UnlockBufHdr_NoHoldoff(buf);
LWLockRelease(buf->io_in_progress_lock);
WaitIO(buf);
}
/* Once we get here, there is definitely no I/O active on this buffer */
if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
{
/* someone else already did the I/O */
UnlockBufHdr_NoHoldoff(buf);
LWLockRelease(buf->io_in_progress_lock);
return false;
}
buf->flags |= BM_IO_IN_PROGRESS;
UnlockBufHdr_NoHoldoff(buf);
InProgressBuf = buf; InProgressBuf = buf;
IsForInput = forInput; IsForInput = forInput;
return true;
} }
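
[Editor's note] The comment above boils down to a simple handshake: take the io_in_progress lock first, then the header spinlock, and claim the I/O only if nobody has already made it unnecessary. A minimal standalone sketch of that pattern (not the commit's code; the Buf struct, the pthread locks, and the try_start_io name are invented for illustration):

#include <stdbool.h>
#include <pthread.h>

/* Toy stand-in for a shared buffer header; fields mirror the flags used. */
typedef struct Buf
{
    pthread_mutex_t io_lock;    /* plays the role of io_in_progress_lock */
    pthread_mutex_t hdr_lock;   /* plays the role of buf_hdr_lock */
    bool            io_busy;    /* BM_IO_IN_PROGRESS */
    bool            valid;      /* BM_VALID */
    bool            dirty;      /* BM_DIRTY */
} Buf;

/*
 * Returns true if the caller now owns the I/O, false if someone else
 * already did the work (page already valid for a read, already clean
 * for a write) -- the same convention StartBufferIO returns with.
 */
bool
try_start_io(Buf *buf, bool for_input)
{
    pthread_mutex_lock(&buf->io_lock);      /* serialize with other I/O */
    pthread_mutex_lock(&buf->hdr_lock);

    if (for_input ? buf->valid : !buf->dirty)
    {
        /* somebody else already made this I/O unnecessary */
        pthread_mutex_unlock(&buf->hdr_lock);
        pthread_mutex_unlock(&buf->io_lock);
        return false;
    }

    buf->io_busy = true;                    /* claim the I/O */
    pthread_mutex_unlock(&buf->hdr_lock);
    /* io_lock stays held until the I/O is terminated */
    return true;
}
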
/* /*
* Function:TerminateBufferIO * TerminateBufferIO: release a buffer we were doing I/O on
* (Assumptions) * (Assumptions)
* My process is executing IO for the buffer * My process is executing IO for the buffer
* BufMgrLock is held * BM_IO_IN_PROGRESS bit is set for the buffer
* BM_IO_IN_PROGRESS mask is set for the buffer * We hold the buffer's io_in_progress lock
* The buffer is Pinned * The buffer is Pinned
* *
* err_flag must be 0 for successful completion and BM_IO_ERROR for failure. * If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the
* buffer's BM_DIRTY flag. This is appropriate when terminating a
* successful write. The check on BM_JUST_DIRTIED is necessary to avoid
* marking the buffer clean if it was re-dirtied while we were writing.
* *
* Because BufMgrLock is held, we are already in an interrupt holdoff here, * set_flag_bits gets ORed into the buffer's flags. It must include
* and do not need another. * BM_IO_ERROR in a failure case. For successful completion it could
* be 0, or BM_VALID if we just finished reading in the page.
*/ */
static void static void
TerminateBufferIO(BufferDesc *buf, int err_flag) TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
{ {
Assert(buf == InProgressBuf); Assert(buf == InProgressBuf);
/* NoHoldoff is OK since we must have an LWLock */
LockBufHdr_NoHoldoff(buf);
Assert(buf->flags & BM_IO_IN_PROGRESS); Assert(buf->flags & BM_IO_IN_PROGRESS);
buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR); buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
buf->flags |= err_flag; if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
buf->flags &= ~BM_DIRTY;
buf->flags |= set_flag_bits;
LWLockRelease(buf->io_in_progress_lock); UnlockBufHdr_NoHoldoff(buf);
InProgressBuf = NULL; InProgressBuf = NULL;
}
/* LWLockRelease(buf->io_in_progress_lock);
* Function:ContinueBufferIO
* (Assumptions)
* My process is executing IO for the buffer
* BufMgrLock is held
* The buffer is Pinned
*
* Because BufMgrLock is held, we are already in an interrupt holdoff here,
* and do not need another.
*/
static void
ContinueBufferIO(BufferDesc *buf, bool forInput)
{
Assert(buf == InProgressBuf);
Assert(buf->flags & BM_IO_IN_PROGRESS);
IsForInput = forInput;
}
#ifdef NOT_USED
void
InitBufferIO(void)
{
InProgressBuf = NULL;
} }
#endif
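
[Editor's note] The clear_dirty / BM_JUST_DIRTIED interplay in TerminateBufferIO is what keeps a concurrent re-dirty from being lost: the writer clears the JUST_DIRTIED marker before copying the page out, and only drops DIRTY afterwards if the marker is still clear. A standalone toy model of that ordering (illustrative flags only, not real buffer code):

#include <stdbool.h>
#include <stdio.h>

static bool dirty, just_dirtied;

/* What any backend does when it modifies the page. */
static void
mark_dirty(void)
{
    dirty = true;
    just_dirtied = true;
}

/* What the writer does around copying out and writing the page image. */
static void
write_buffer(bool redirtied_during_write)
{
    just_dirtied = false;           /* start of write: clear the marker */

    /* ... page image is copied and handed to the kernel here ... */
    if (redirtied_during_write)
        mark_dirty();               /* concurrent update while writing */

    if (!just_dirtied)
        dirty = false;              /* safe: nobody touched it meanwhile */
}

int
main(void)
{
    mark_dirty();
    write_buffer(true);
    printf("still dirty? %s\n", dirty ? "yes" : "no");  /* yes */

    write_buffer(false);
    printf("still dirty? %s\n", dirty ? "yes" : "no");  /* no */
    return 0;
}
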
/* /*
* Clean up any active buffer I/O after an error. * AbortBufferIO: Clean up any active buffer I/O after an error.
* BufMgrLock isn't held when this function is called, *
* All LWLocks we might have held have been released,
* but we haven't yet released buffer pins, so the buffer is still pinned. * but we haven't yet released buffer pins, so the buffer is still pinned.
* *
* If I/O was in progress, we always set BM_IO_ERROR. * If I/O was in progress, we always set BM_IO_ERROR, even though it's
* possible the error condition wasn't related to the I/O.
*/ */
void void
AbortBufferIO(void) AbortBufferIO(void)
...@@ -1994,20 +2059,27 @@ AbortBufferIO(void) ...@@ -1994,20 +2059,27 @@ AbortBufferIO(void)
*/ */
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE); LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); /* NoHoldoff is OK since we now have an LWLock */
LockBufHdr_NoHoldoff(buf);
Assert(buf->flags & BM_IO_IN_PROGRESS); Assert(buf->flags & BM_IO_IN_PROGRESS);
if (IsForInput) if (IsForInput)
{ {
Assert(!(buf->flags & BM_DIRTY || buf->cntxDirty)); Assert(!(buf->flags & BM_DIRTY));
/* We'd better not think buffer is valid yet */ /* We'd better not think buffer is valid yet */
Assert(!(buf->flags & BM_VALID)); Assert(!(buf->flags & BM_VALID));
UnlockBufHdr_NoHoldoff(buf);
} }
else else
{ {
Assert(buf->flags & BM_DIRTY || buf->cntxDirty); BufFlags sv_flags;
sv_flags = buf->flags;
Assert(sv_flags & BM_DIRTY);
UnlockBufHdr_NoHoldoff(buf);
/* Issue notice if this is not the first failure... */ /* Issue notice if this is not the first failure... */
if (buf->flags & BM_IO_ERROR) if (sv_flags & BM_IO_ERROR)
{ {
/* Buffer is pinned, so we can read tag without spinlock */
ereport(WARNING, ereport(WARNING,
(errcode(ERRCODE_IO_ERROR), (errcode(ERRCODE_IO_ERROR),
errmsg("could not write block %u of %u/%u/%u", errmsg("could not write block %u of %u/%u/%u",
...@@ -2017,10 +2089,8 @@ AbortBufferIO(void) ...@@ -2017,10 +2089,8 @@ AbortBufferIO(void)
buf->tag.rnode.relNode), buf->tag.rnode.relNode),
errdetail("Multiple failures --- write error may be permanent."))); errdetail("Multiple failures --- write error may be permanent.")));
} }
buf->flags |= BM_DIRTY;
} }
TerminateBufferIO(buf, BM_IO_ERROR); TerminateBufferIO(buf, false, BM_IO_ERROR);
LWLockRelease(BufMgrLock);
} }
} }
...@@ -2032,6 +2102,7 @@ buffer_write_error_callback(void *arg) ...@@ -2032,6 +2102,7 @@ buffer_write_error_callback(void *arg)
{ {
BufferDesc *bufHdr = (BufferDesc *) arg; BufferDesc *bufHdr = (BufferDesc *) arg;
/* Buffer is pinned, so we can read the tag without locking the spinlock */
if (bufHdr != NULL) if (bufHdr != NULL)
errcontext("writing block %u of relation %u/%u/%u", errcontext("writing block %u of relation %u/%u/%u",
bufHdr->tag.blockNum, bufHdr->tag.blockNum,
......
/*------------------------------------------------------------------------- /*-------------------------------------------------------------------------
* *
* freelist.c * freelist.c
* routines for manipulating the buffer pool's replacement strategy. * routines for managing the buffer pool's replacement strategy.
*
* The name "freelist.c" is now a bit of a misnomer, since this module
* controls not only the list of free buffers per se, but the entire
* mechanism for looking up existing shared buffers and the strategy
* for choosing replacement victims when needed.
*
* Note: all routines in this file assume that the BufMgrLock is held
* by the caller, so no synchronization is needed.
* *
* *
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
...@@ -17,386 +9,38 @@ ...@@ -17,386 +9,38 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.50 2005/02/03 23:29:11 tgl Exp $ * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.51 2005/03/04 20:21:06 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
#include "postgres.h" #include "postgres.h"
#include <time.h>
#include "access/xact.h"
#include "storage/buf_internals.h" #include "storage/buf_internals.h"
#include "storage/bufmgr.h" #include "storage/bufmgr.h"
/* /*
* Definitions for the buffer replacement strategy * The shared freelist control information.
*/
#define STRAT_LIST_UNUSED (-1)
#define STRAT_LIST_B1 0
#define STRAT_LIST_T1 1
#define STRAT_LIST_T2 2
#define STRAT_LIST_B2 3
#define STRAT_NUM_LISTS 4
/*
* The Cache Directory Block (CDB) of the Adaptive Replacement Cache (ARC)
*/ */
typedef struct typedef struct
{ {
int prev; /* list links */ /* Clock sweep hand: index of next buffer to consider grabbing */
int next; int nextVictimBuffer;
short list; /* ID of list it is currently in */
bool t1_vacuum; /* t => present only because of VACUUM */
TransactionId t1_xid; /* the xid this entry went onto T1 */
BufferTag buf_tag; /* page identifier */
int buf_id; /* currently assigned data buffer, or -1 */
} BufferStrategyCDB;
/* int firstFreeBuffer; /* Head of list of unused buffers */
* The shared ARC control information. int lastFreeBuffer; /* Tail of list of unused buffers */
/*
* NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1
* (that is, when the list is empty)
*/ */
typedef struct
{
int target_T1_size; /* What T1 size are we aiming for */
int listUnusedCDB; /* All unused StrategyCDB */
int listHead[STRAT_NUM_LISTS]; /* ARC lists B1, T1, T2
* and B2 */
int listTail[STRAT_NUM_LISTS];
int listSize[STRAT_NUM_LISTS];
Buffer listFreeBuffers; /* List of unused buffers */
long num_lookup; /* Some hit statistics */
long num_hit[STRAT_NUM_LISTS];
time_t stat_report;
/* Array of CDB's starts here */
BufferStrategyCDB cdb[1]; /* VARIABLE SIZE ARRAY */
} BufferStrategyControl; } BufferStrategyControl;
/* GUC variable: time in seconds between statistics reports */
int DebugSharedBuffers = 0;
/* Pointers to shared state */ /* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL; static BufferStrategyControl *StrategyControl = NULL;
static BufferStrategyCDB *StrategyCDB = NULL;
/* Backend-local state about whether currently vacuuming */ /* Backend-local state about whether currently vacuuming */
static bool strategy_hint_vacuum = false; bool strategy_hint_vacuum = false;
static TransactionId strategy_vacuum_xid;
#define T1_TARGET (StrategyControl->target_T1_size)
#define B1_LENGTH (StrategyControl->listSize[STRAT_LIST_B1])
#define T1_LENGTH (StrategyControl->listSize[STRAT_LIST_T1])
#define T2_LENGTH (StrategyControl->listSize[STRAT_LIST_T2])
#define B2_LENGTH (StrategyControl->listSize[STRAT_LIST_B2])
/*
* Macro to remove a CDB from whichever list it currently is on
*/
#define STRAT_LIST_REMOVE(cdb) \
do { \
Assert((cdb)->list >= 0 && (cdb)->list < STRAT_NUM_LISTS); \
if ((cdb)->prev < 0) \
StrategyControl->listHead[(cdb)->list] = (cdb)->next; \
else \
StrategyCDB[(cdb)->prev].next = (cdb)->next; \
if ((cdb)->next < 0) \
StrategyControl->listTail[(cdb)->list] = (cdb)->prev; \
else \
StrategyCDB[(cdb)->next].prev = (cdb)->prev; \
StrategyControl->listSize[(cdb)->list]--; \
(cdb)->list = STRAT_LIST_UNUSED; \
} while(0)
/*
* Macro to add a CDB to the tail of a list (MRU position)
*/
#define STRAT_MRU_INSERT(cdb,l) \
do { \
Assert((cdb)->list == STRAT_LIST_UNUSED); \
if (StrategyControl->listTail[(l)] < 0) \
{ \
(cdb)->prev = (cdb)->next = -1; \
StrategyControl->listHead[(l)] = \
StrategyControl->listTail[(l)] = \
((cdb) - StrategyCDB); \
} \
else \
{ \
(cdb)->next = -1; \
(cdb)->prev = StrategyControl->listTail[(l)]; \
StrategyCDB[StrategyControl->listTail[(l)]].next = \
((cdb) - StrategyCDB); \
StrategyControl->listTail[(l)] = \
((cdb) - StrategyCDB); \
} \
StrategyControl->listSize[(l)]++; \
(cdb)->list = (l); \
} while(0)
/*
* Macro to add a CDB to the head of a list (LRU position)
*/
#define STRAT_LRU_INSERT(cdb,l) \
do { \
Assert((cdb)->list == STRAT_LIST_UNUSED); \
if (StrategyControl->listHead[(l)] < 0) \
{ \
(cdb)->prev = (cdb)->next = -1; \
StrategyControl->listHead[(l)] = \
StrategyControl->listTail[(l)] = \
((cdb) - StrategyCDB); \
} \
else \
{ \
(cdb)->prev = -1; \
(cdb)->next = StrategyControl->listHead[(l)]; \
StrategyCDB[StrategyControl->listHead[(l)]].prev = \
((cdb) - StrategyCDB); \
StrategyControl->listHead[(l)] = \
((cdb) - StrategyCDB); \
} \
StrategyControl->listSize[(l)]++; \
(cdb)->list = (l); \
} while(0)
/*
* Printout for use when DebugSharedBuffers is enabled
*/
static void
StrategyStatsDump(void)
{
time_t now = time(NULL);
if (StrategyControl->stat_report + DebugSharedBuffers < now)
{
long all_hit,
b1_hit,
t1_hit,
t2_hit,
b2_hit;
int id,
t1_clean,
t2_clean;
ErrorContextCallback *errcxtold;
id = StrategyControl->listHead[STRAT_LIST_T1];
t1_clean = 0;
while (id >= 0)
{
if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY)
break;
t1_clean++;
id = StrategyCDB[id].next;
}
id = StrategyControl->listHead[STRAT_LIST_T2];
t2_clean = 0;
while (id >= 0)
{
if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY)
break;
t2_clean++;
id = StrategyCDB[id].next;
}
if (StrategyControl->num_lookup == 0)
all_hit = b1_hit = t1_hit = t2_hit = b2_hit = 0;
else
{
b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 /
StrategyControl->num_lookup);
t1_hit = (StrategyControl->num_hit[STRAT_LIST_T1] * 100 /
StrategyControl->num_lookup);
t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 /
StrategyControl->num_lookup);
b2_hit = (StrategyControl->num_hit[STRAT_LIST_B2] * 100 /
StrategyControl->num_lookup);
all_hit = b1_hit + t1_hit + t2_hit + b2_hit;
}
errcxtold = error_context_stack;
error_context_stack = NULL;
elog(DEBUG1, "ARC T1target=%5d B1len=%5d T1len=%5d T2len=%5d B2len=%5d",
T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH, B2_LENGTH);
elog(DEBUG1, "ARC total =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%% B2hit=%4ld%%",
all_hit, b1_hit, t1_hit, t2_hit, b2_hit);
elog(DEBUG1, "ARC clean buffers at LRU T1= %5d T2= %5d",
t1_clean, t2_clean);
error_context_stack = errcxtold;
StrategyControl->num_lookup = 0;
StrategyControl->num_hit[STRAT_LIST_B1] = 0;
StrategyControl->num_hit[STRAT_LIST_T1] = 0;
StrategyControl->num_hit[STRAT_LIST_T2] = 0;
StrategyControl->num_hit[STRAT_LIST_B2] = 0;
StrategyControl->stat_report = now;
}
}
/*
* StrategyBufferLookup
*
* Lookup a page request in the cache directory. A buffer is only
* returned for a T1 or T2 cache hit. B1 and B2 hits are just
* remembered here, to possibly affect the behaviour later.
*
* recheck indicates we are rechecking after I/O wait; do not change
* internal status in this case.
*
* *cdb_found_index is set to the index of the found CDB, or -1 if none.
* This is not intended to be used by the caller, except to pass to
* StrategyReplaceBuffer().
*/
BufferDesc *
StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
int *cdb_found_index)
{
BufferStrategyCDB *cdb;
/* Optional stats printout */
if (DebugSharedBuffers > 0)
StrategyStatsDump();
/*
* Count lookups
*/
StrategyControl->num_lookup++;
/*
* Lookup the block in the shared hash table
*/
*cdb_found_index = BufTableLookup(tagPtr);
/*
* Done if complete CDB lookup miss
*/
if (*cdb_found_index < 0)
return NULL;
/*
* We found a CDB
*/
cdb = &StrategyCDB[*cdb_found_index];
/*
* Count hits
*/
StrategyControl->num_hit[cdb->list]++;
/*
* If this is a T2 hit, we simply move the CDB to the T2 MRU position
* and return the found buffer.
*
* A CDB in T2 cannot have t1_vacuum set, so we needn't check. However,
* if the current process is VACUUM then it doesn't promote to MRU.
*/
if (cdb->list == STRAT_LIST_T2)
{
if (!strategy_hint_vacuum)
{
STRAT_LIST_REMOVE(cdb);
STRAT_MRU_INSERT(cdb, STRAT_LIST_T2);
}
return &BufferDescriptors[cdb->buf_id];
}
/*
* If this is a T1 hit, we move the buffer to the T2 MRU only if
* another transaction had read it into T1, *and* neither transaction
* is a VACUUM. This is required because any UPDATE or DELETE in
* PostgreSQL does multiple ReadBuffer(), first during the scan, later
* during the heap_update() or heap_delete(). Otherwise move to T1
* MRU. VACUUM doesn't even get to make that happen.
*/
if (cdb->list == STRAT_LIST_T1)
{
if (!strategy_hint_vacuum)
{
if (!cdb->t1_vacuum &&
!TransactionIdEquals(cdb->t1_xid, GetTopTransactionId()))
{
STRAT_LIST_REMOVE(cdb);
STRAT_MRU_INSERT(cdb, STRAT_LIST_T2);
}
else
{
STRAT_LIST_REMOVE(cdb);
STRAT_MRU_INSERT(cdb, STRAT_LIST_T1);
/*
* If a non-VACUUM process references a page recently
* loaded by VACUUM, clear the stigma; the state will now
* be the same as if this process loaded it originally.
*/
if (cdb->t1_vacuum)
{
cdb->t1_xid = GetTopTransactionId();
cdb->t1_vacuum = false;
}
}
}
return &BufferDescriptors[cdb->buf_id];
}
/*
* In the case of a recheck we don't care about B1 or B2 hits here.
* The bufmgr does this call only to make sure no-one faulted in the
* block while we where busy flushing another; we don't want to doubly
* adjust the T1target.
*
* Now for this really to end up as a B1 or B2 cache hit, we must have
* been flushing for quite some time as the block not only must have
* been read, but also traveled through the queue and evicted from the
* T cache again already.
*
* VACUUM re-reads shouldn't adjust the target either.
*/
if (recheck || strategy_hint_vacuum)
return NULL;
/*
* Adjust the target size of the T1 cache depending on if this is a B1
* or B2 hit.
*/
switch (cdb->list)
{
case STRAT_LIST_B1:
/*
* B1 hit means that the T1 cache is probably too small.
* Adjust the T1 target size and continue below.
*/
T1_TARGET = Min(T1_TARGET + Max(B2_LENGTH / B1_LENGTH, 1),
NBuffers);
break;
case STRAT_LIST_B2:
/*
* B2 hit means that the T2 cache is probably too small.
* Adjust the T1 target size and continue below.
*/
T1_TARGET = Max(T1_TARGET - Max(B1_LENGTH / B2_LENGTH, 1), 0);
break;
default:
elog(ERROR, "buffer hash table corrupted: CDB->list = %d",
cdb->list);
}
/*
* Even though we had seen the block in the past, its data is not
* currently in memory ... cache miss to the bufmgr.
*/
return NULL;
}
/* /*
...@@ -404,371 +48,146 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck, ...@@ -404,371 +48,146 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
* *
* Called by the bufmgr to get the next candidate buffer to use in * Called by the bufmgr to get the next candidate buffer to use in
* BufferAlloc(). The only hard requirement BufferAlloc() has is that * BufferAlloc(). The only hard requirement BufferAlloc() has is that
* this buffer must not currently be pinned. * the selected buffer must not currently be pinned by anyone.
* *
* *cdb_replace_index is set to the index of the candidate CDB, or -1 if * To ensure that no one else can pin the buffer before we do, we must
* none (meaning we are using a previously free buffer). This is not * return the buffer with the buffer header spinlock still held. That
* intended to be used by the caller, except to pass to * means that we return with the BufFreelistLock still held, as well;
* StrategyReplaceBuffer(). * the caller must release that lock once the spinlock is dropped.
*/ */
BufferDesc * BufferDesc *
StrategyGetBuffer(int *cdb_replace_index) StrategyGetBuffer(void)
{ {
int cdb_id;
BufferDesc *buf; BufferDesc *buf;
int trycounter;
LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
if (StrategyControl->listFreeBuffers < 0)
{
/*
* We don't have a free buffer, must take one from T1 or T2.
* Choose based on trying to converge T1len to T1target.
*/
if (T1_LENGTH >= Max(1, T1_TARGET))
{
/* /*
* We should take the first unpinned buffer from T1. * Try to get a buffer from the freelist. Note that the freeNext fields
* are considered to be protected by the BufFreelistLock not the
* individual buffer spinlocks, so it's OK to manipulate them without
* holding the spinlock.
*/ */
cdb_id = StrategyControl->listHead[STRAT_LIST_T1]; while (StrategyControl->firstFreeBuffer >= 0)
while (cdb_id >= 0)
{ {
buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id]; buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
if (buf->refcount == 0) Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
{
*cdb_replace_index = cdb_id; /* Unconditionally remove buffer from freelist */
Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1); StrategyControl->firstFreeBuffer = buf->freeNext;
return buf; buf->freeNext = FREENEXT_NOT_IN_LIST;
}
cdb_id = StrategyCDB[cdb_id].next;
}
/* /*
* No unpinned T1 buffer found - try T2 cache. * If the buffer is pinned or has a nonzero usage_count,
* we cannot use it; discard it and retry. (This can only happen
* if VACUUM put a valid buffer in the freelist and then someone
* else used it before we got to it.)
*/ */
cdb_id = StrategyControl->listHead[STRAT_LIST_T2]; LockBufHdr(buf);
while (cdb_id >= 0) if (buf->refcount == 0 && buf->usage_count == 0)
{
buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
if (buf->refcount == 0)
{
*cdb_replace_index = cdb_id;
Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2);
return buf; return buf;
} UnlockBufHdr(buf);
cdb_id = StrategyCDB[cdb_id].next;
} }
/* /* Nothing on the freelist, so run the "clock sweep" algorithm */
* No unpinned buffers at all!!! trycounter = NBuffers;
*/ for (;;)
elog(ERROR, "no unpinned buffers available");
}
else
{
/*
* We should take the first unpinned buffer from T2.
*/
cdb_id = StrategyControl->listHead[STRAT_LIST_T2];
while (cdb_id >= 0)
{
buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
if (buf->refcount == 0)
{ {
*cdb_replace_index = cdb_id; buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];
Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2);
return buf; if (++StrategyControl->nextVictimBuffer >= NBuffers)
} StrategyControl->nextVictimBuffer = 0;
cdb_id = StrategyCDB[cdb_id].next;
}
/* /*
* No unpinned T2 buffer found - try T1 cache. * If the buffer is pinned or has a nonzero usage_count,
* we cannot use it; decrement the usage_count and keep scanning.
*/ */
cdb_id = StrategyControl->listHead[STRAT_LIST_T1]; LockBufHdr(buf);
while (cdb_id >= 0) if (buf->refcount == 0 && buf->usage_count == 0)
{
buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
if (buf->refcount == 0)
{
*cdb_replace_index = cdb_id;
Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1);
return buf; return buf;
if (buf->usage_count > 0)
{
buf->usage_count--;
trycounter = NBuffers;
} }
cdb_id = StrategyCDB[cdb_id].next; else if (--trycounter == 0)
} {
/* /*
* No unpinned buffers at all!!! * We've scanned all the buffers without making any state
* changes, so all the buffers are pinned (or were when we
* looked at them). We could hope that someone will free
* one eventually, but it's probably better to fail than to
* risk getting stuck in an infinite loop.
*/ */
UnlockBufHdr(buf);
elog(ERROR, "no unpinned buffers available"); elog(ERROR, "no unpinned buffers available");
} }
} UnlockBufHdr(buf);
else
{
/* There is a completely free buffer available - take it */
/*
* Note: This code uses the side effect that a free buffer can
* never be pinned or dirty and therefore the call to
* StrategyReplaceBuffer() will happen without the bufmgr
* releasing the bufmgr-lock in the meantime. That means, that
* there will never be any reason to recheck. Otherwise we would
* leak shared buffers here!
*/
*cdb_replace_index = -1;
buf = &BufferDescriptors[StrategyControl->listFreeBuffers];
StrategyControl->listFreeBuffers = buf->bufNext;
buf->bufNext = -1;
/* Buffer in freelist cannot be pinned */
Assert(buf->refcount == 0);
Assert(!(buf->flags & BM_DIRTY));
return buf;
} }
/* not reached */ /* not reached */
return NULL; return NULL;
} }
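
[Editor's note] StrategyGetBuffer above is the clock sweep from the commit message: the hand advances circularly, decrementing usage_count as it goes, and a buffer is taken only when both refcount and usage_count are zero; the trycounter guard turns "everything is pinned" into an error instead of an infinite loop. A self-contained toy version of the same loop (plain arrays instead of BufferDesc; the names below are invented):

#include <stdio.h>

#define NBUF 8

static int refcount[NBUF];      /* pins held by backends */
static int usage_count[NBUF];   /* bumped on use, decayed by the sweep */
static int next_victim = 0;     /* the clock hand */

/* Returns the index of a reusable buffer, or -1 if everything is pinned. */
static int
clock_sweep(void)
{
    int tries = NBUF;

    for (;;)
    {
        int b = next_victim;

        next_victim = (next_victim + 1) % NBUF;

        if (refcount[b] == 0 && usage_count[b] == 0)
            return b;               /* victim found */

        if (usage_count[b] > 0)
        {
            usage_count[b]--;       /* give it one more lap */
            tries = NBUF;           /* state changed: reset the guard */
        }
        else if (--tries == 0)
            return -1;              /* every buffer is pinned */
    }
}

int
main(void)
{
    usage_count[0] = 3;             /* recently used */
    refcount[1] = 1;                /* pinned: never chosen */
    printf("victim: %d\n", clock_sweep());  /* prints "victim: 2" */
    return 0;
}
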
/* /*
* StrategyReplaceBuffer * StrategyFreeBuffer: put a buffer on the freelist
*
* Called by the buffer manager to inform us that he flushed a buffer
* and is now about to replace the content. Prior to this call,
* the cache algorithm still reports the buffer as in the cache. After
* this call we report the new block, even if IO might still need to
* be done to bring in the new content.
* *
* cdb_found_index and cdb_replace_index must be the auxiliary values * The buffer is added either at the head or the tail, according to the
* returned by previous calls to StrategyBufferLookup and StrategyGetBuffer. * at_head parameter. This allows a small amount of control over how
* quickly the buffer is reused.
*/ */
void void
StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag, StrategyFreeBuffer(BufferDesc *buf, bool at_head)
int cdb_found_index, int cdb_replace_index)
{ {
BufferStrategyCDB *cdb_found; LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
BufferStrategyCDB *cdb_replace;
if (cdb_found_index >= 0)
{
/* This must have been a ghost buffer cache hit (B1 or B2) */
cdb_found = &StrategyCDB[cdb_found_index];
/* Assert that the buffer remembered in cdb_found is the one */
/* the buffer manager is currently faulting in */
Assert(BUFFERTAGS_EQUAL(cdb_found->buf_tag, *newTag));
if (cdb_replace_index >= 0)
{
/* We are satisfying it with an evicted T buffer */
cdb_replace = &StrategyCDB[cdb_replace_index];
/* Assert that the buffer remembered in cdb_replace is */
/* the one the buffer manager has just evicted */
Assert(cdb_replace->list == STRAT_LIST_T1 ||
cdb_replace->list == STRAT_LIST_T2);
Assert(cdb_replace->buf_id == buf->buf_id);
Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));
/* /*
* Under normal circumstances we move the evicted T list entry * It is possible that we are told to put something in the freelist
* to the corresponding B list. However, T1 entries that * that is already in it; don't screw up the list if so.
* exist only because of VACUUM are just thrown into the
* unused list instead. We don't expect them to be touched
* again by the VACUUM, and if we put them into B1 then VACUUM
* would skew T1_target adjusting.
*/ */
if (cdb_replace->t1_vacuum) if (buf->freeNext == FREENEXT_NOT_IN_LIST)
{
BufTableDelete(&(cdb_replace->buf_tag));
STRAT_LIST_REMOVE(cdb_replace);
cdb_replace->next = StrategyControl->listUnusedCDB;
StrategyControl->listUnusedCDB = cdb_replace_index;
}
else
{ {
if (cdb_replace->list == STRAT_LIST_T1) if (at_head)
{ {
STRAT_LIST_REMOVE(cdb_replace); buf->freeNext = StrategyControl->firstFreeBuffer;
STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1); if (buf->freeNext < 0)
StrategyControl->lastFreeBuffer = buf->buf_id;
StrategyControl->firstFreeBuffer = buf->buf_id;
} }
else else
{ {
STRAT_LIST_REMOVE(cdb_replace); buf->freeNext = FREENEXT_END_OF_LIST;
STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2); if (StrategyControl->firstFreeBuffer < 0)
} StrategyControl->firstFreeBuffer = buf->buf_id;
}
/* And clear its block reference */
cdb_replace->buf_id = -1;
}
else else
{ BufferDescriptors[StrategyControl->lastFreeBuffer].freeNext = buf->buf_id;
/* We are satisfying it with an unused buffer */ StrategyControl->lastFreeBuffer = buf->buf_id;
} }
/* Now the found B CDB gets the buffer and is moved to T2 */
cdb_found->buf_id = buf->buf_id;
STRAT_LIST_REMOVE(cdb_found);
STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T2);
} }
else
{
/*
* This was a complete cache miss, so we need to create a new CDB.
* The goal is to keep T1len+B1len <= c.
*/
if (B1_LENGTH > 0 && (T1_LENGTH + B1_LENGTH) >= NBuffers)
{
/* So if B1 isn't empty and T1len+B1len >= c we take B1-LRU */
cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];
BufTableDelete(&(cdb_found->buf_tag));
STRAT_LIST_REMOVE(cdb_found);
}
else
{
/* Otherwise, we try to use a free one */
if (StrategyControl->listUnusedCDB >= 0)
{
cdb_found = &StrategyCDB[StrategyControl->listUnusedCDB];
StrategyControl->listUnusedCDB = cdb_found->next;
}
else
{
/* If there isn't, we take B2-LRU ... except if */
/* T1len+B1len+T2len = c ... oh my */
if (B2_LENGTH > 0)
cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B2]];
else
cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];
BufTableDelete(&(cdb_found->buf_tag)); LWLockRelease(BufFreelistLock);
STRAT_LIST_REMOVE(cdb_found);
}
}
/* Set the CDB's buf_tag and insert it into the hash table */
cdb_found->buf_tag = *newTag;
BufTableInsert(&(cdb_found->buf_tag), (cdb_found - StrategyCDB));
if (cdb_replace_index >= 0)
{
/*
* The buffer was formerly in a T list, move its CDB to the
* corresponding B list
*/
cdb_replace = &StrategyCDB[cdb_replace_index];
Assert(cdb_replace->list == STRAT_LIST_T1 ||
cdb_replace->list == STRAT_LIST_T2);
Assert(cdb_replace->buf_id == buf->buf_id);
Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));
if (cdb_replace->list == STRAT_LIST_T1)
{
STRAT_LIST_REMOVE(cdb_replace);
STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
}
else
{
STRAT_LIST_REMOVE(cdb_replace);
STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
}
/* And clear its block reference */
cdb_replace->buf_id = -1;
}
else
{
/* We are satisfying it with an unused buffer */
}
/* Assign the buffer id to the new CDB */
cdb_found->buf_id = buf->buf_id;
/*
* Specialized VACUUM optimization. If this complete cache miss
* happened because vacuum needed the page, we place it at the LRU
* position of T1; normally it goes at the MRU position.
*/
if (strategy_hint_vacuum)
{
if (TransactionIdEquals(strategy_vacuum_xid,
GetTopTransactionId()))
STRAT_LRU_INSERT(cdb_found, STRAT_LIST_T1);
else
{
/* VACUUM must have been aborted by error, reset flag */
strategy_hint_vacuum = false;
STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
}
}
else
STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
/*
* Remember the Xid when this buffer went onto T1 to avoid a
* single UPDATE promoting a newcomer straight into T2. Also
* remember if it was loaded for VACUUM.
*/
cdb_found->t1_xid = GetTopTransactionId();
cdb_found->t1_vacuum = strategy_hint_vacuum;
}
} }
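
[Editor's note] StrategyFreeBuffer maintains the freelist as a chain of buffer indexes threaded through freeNext, using the FREENEXT_* sentinels defined in buf_internals.h, with at_head deciding how soon the buffer comes back around. A standalone sketch of the same head/tail insertion (toy arrays; names invented for illustration):

#include <stdbool.h>
#include <stdio.h>

#define NBUF 4
#define FREENEXT_END_OF_LIST  (-1)
#define FREENEXT_NOT_IN_LIST  (-2)

static int free_next[NBUF];                     /* per-buffer freeNext */
static int first_free = FREENEXT_END_OF_LIST;
static int last_free  = FREENEXT_END_OF_LIST;   /* undefined while empty */

/* Put buffer b on the freelist, at the head or the tail. */
static void
free_buffer(int b, bool at_head)
{
    if (free_next[b] != FREENEXT_NOT_IN_LIST)
        return;                                 /* already listed: no-op */

    if (at_head)
    {
        free_next[b] = first_free;
        if (first_free < 0)
            last_free = b;                      /* list was empty */
        first_free = b;
    }
    else
    {
        free_next[b] = FREENEXT_END_OF_LIST;
        if (first_free < 0)
            first_free = b;                     /* list was empty */
        else
            free_next[last_free] = b;
        last_free = b;
    }
}

int
main(void)
{
    for (int i = 0; i < NBUF; i++)
        free_next[i] = FREENEXT_NOT_IN_LIST;

    free_buffer(2, true);           /* list: 2 */
    free_buffer(0, false);          /* list: 2 0 */
    free_buffer(3, true);           /* list: 3 2 0 */

    for (int b = first_free; b >= 0; b = free_next[b])
        printf("%d ", b);           /* prints: 3 2 0 */
    printf("\n");
    return 0;
}
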
/* /*
* StrategyInvalidateBuffer * StrategySyncStart -- tell BufferSync where to start syncing
* *
* Called by the buffer manager to inform us that a buffer content * The result is the buffer index of the best buffer to sync first.
* is no longer valid. We simply throw away any eventual existing * BufferSync() will proceed circularly around the buffer array from there.
* buffer hash entry and move the CDB and buffer to the free lists.
*/ */
void int
StrategyInvalidateBuffer(BufferDesc *buf) StrategySyncStart(void)
{ {
int cdb_id; int result;
BufferStrategyCDB *cdb;
/* The buffer cannot be dirty or pinned */
Assert(!(buf->flags & BM_DIRTY) || !(buf->flags & BM_VALID));
Assert(buf->refcount == 0);
/* /*
* Lookup the cache directory block for this buffer * We could probably dispense with the locking here, but just to be
* safe ...
*/ */
cdb_id = BufTableLookup(&(buf->tag)); LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
if (cdb_id < 0) result = StrategyControl->nextVictimBuffer;
elog(ERROR, "buffer %d not in buffer hash table", buf->buf_id); LWLockRelease(BufFreelistLock);
cdb = &StrategyCDB[cdb_id]; return result;
/*
* Remove the CDB from the hashtable and the ARC queue it is currently
* on.
*/
BufTableDelete(&(cdb->buf_tag));
STRAT_LIST_REMOVE(cdb);
/*
* Clear out the CDB's buffer tag and association with the buffer and
* add it to the list of unused CDB's
*/
CLEAR_BUFFERTAG(cdb->buf_tag);
cdb->buf_id = -1;
cdb->next = StrategyControl->listUnusedCDB;
StrategyControl->listUnusedCDB = cdb_id;
/*
* Clear out the buffer's tag and add it to the list of currently
* unused buffers. We must do this to ensure that linear scans of the
* buffer array don't think the buffer is valid.
*/
CLEAR_BUFFERTAG(buf->tag);
buf->flags &= ~(BM_VALID | BM_DIRTY);
buf->cntxDirty = false;
buf->bufNext = StrategyControl->listFreeBuffers;
StrategyControl->listFreeBuffers = buf->buf_id;
} }
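
[Editor's note] StrategySyncStart simply hands the background writer the current clock hand; BufferSync then walks the whole array once, circularly, from that point. A tiny standalone illustration of such a circular pass (names invented):

#include <stdio.h>

#define NBUF 8

/*
 * One circular pass over all buffers starting at 'start', the way
 * BufferSync uses the index returned by StrategySyncStart.
 */
static void
sync_pass(int start)
{
    for (int i = 0, b = start; i < NBUF; i++, b = (b + 1) % NBUF)
        printf("examine buffer %d\n", b);   /* write it out if dirty */
}

int
main(void)
{
    sync_pass(5);       /* visits 5, 6, 7, 0, 1, 2, 3, 4 */
    return 0;
}
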
/* /*
...@@ -778,87 +197,6 @@ void ...@@ -778,87 +197,6 @@ void
StrategyHintVacuum(bool vacuum_active) StrategyHintVacuum(bool vacuum_active)
{ {
strategy_hint_vacuum = vacuum_active; strategy_hint_vacuum = vacuum_active;
strategy_vacuum_xid = GetTopTransactionId();
}
/*
* StrategyDirtyBufferList
*
* Returns a list of dirty buffers, in priority order for writing.
* Note that the caller may choose not to write them all.
*
* The caller must beware of the possibility that a buffer is no longer dirty,
* or even contains a different page, by the time he reaches it. If it no
* longer contains the same page it need not be written, even if it is (again)
* dirty.
*
* Buffer pointers are stored into buffers[], and corresponding tags into
* buftags[], both of size max_buffers. The function returns the number of
* buffer IDs stored.
*/
int
StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
int max_buffers)
{
int num_buffer_dirty = 0;
int cdb_id_t1;
int cdb_id_t2;
int buf_id;
BufferDesc *buf;
/*
* Traverse the T1 and T2 list LRU to MRU in "parallel" and add all
* dirty buffers found in that order to the list. The ARC strategy
* keeps all used buffers including pinned ones in the T1 or T2 list.
* So we cannot miss any dirty buffers.
*/
cdb_id_t1 = StrategyControl->listHead[STRAT_LIST_T1];
cdb_id_t2 = StrategyControl->listHead[STRAT_LIST_T2];
while (cdb_id_t1 >= 0 || cdb_id_t2 >= 0)
{
if (cdb_id_t1 >= 0)
{
buf_id = StrategyCDB[cdb_id_t1].buf_id;
buf = &BufferDescriptors[buf_id];
if (buf->flags & BM_VALID)
{
if ((buf->flags & BM_DIRTY) || (buf->cntxDirty))
{
buffers[num_buffer_dirty] = buf;
buftags[num_buffer_dirty] = buf->tag;
num_buffer_dirty++;
if (num_buffer_dirty >= max_buffers)
break;
}
}
cdb_id_t1 = StrategyCDB[cdb_id_t1].next;
}
if (cdb_id_t2 >= 0)
{
buf_id = StrategyCDB[cdb_id_t2].buf_id;
buf = &BufferDescriptors[buf_id];
if (buf->flags & BM_VALID)
{
if ((buf->flags & BM_DIRTY) || (buf->cntxDirty))
{
buffers[num_buffer_dirty] = buf;
buftags[num_buffer_dirty] = buf->tag;
num_buffer_dirty++;
if (num_buffer_dirty >= max_buffers)
break;
}
}
cdb_id_t2 = StrategyCDB[cdb_id_t2].next;
}
}
return num_buffer_dirty;
} }
...@@ -866,21 +204,21 @@ StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags, ...@@ -866,21 +204,21 @@ StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
* StrategyShmemSize * StrategyShmemSize
* *
* estimate the size of shared memory used by the freelist-related structures. * estimate the size of shared memory used by the freelist-related structures.
*
* Note: for somewhat historical reasons, the buffer lookup hashtable size
* is also determined here.
*/ */
int int
StrategyShmemSize(void) StrategyShmemSize(void)
{ {
int size = 0; int size = 0;
/* size of CDB lookup hash table */ /* size of lookup hash table */
size += BufTableShmemSize(NBuffers * 2); size += BufTableShmemSize(NBuffers);
/* size of the shared replacement strategy control block */ /* size of the shared replacement strategy control block */
size += MAXALIGN(sizeof(BufferStrategyControl)); size += MAXALIGN(sizeof(BufferStrategyControl));
/* size of the ARC directory blocks */
size += MAXALIGN(NBuffers * 2 * sizeof(BufferStrategyCDB));
return size; return size;
} }
...@@ -888,29 +226,26 @@ StrategyShmemSize(void) ...@@ -888,29 +226,26 @@ StrategyShmemSize(void)
* StrategyInitialize -- initialize the buffer cache replacement * StrategyInitialize -- initialize the buffer cache replacement
* strategy. * strategy.
* *
* Assume: All of the buffers are already building a linked list. * Assumes: All of the buffers are already built into a linked list.
* Only called by postmaster and only during initialization. * Only called by postmaster and only during initialization.
*/ */
void void
StrategyInitialize(bool init) StrategyInitialize(bool init)
{ {
bool found; bool found;
int i;
/* /*
* Initialize the shared CDB lookup hashtable * Initialize the shared buffer lookup hashtable.
*/ */
InitBufTable(NBuffers * 2); InitBufTable(NBuffers);
/* /*
* Get or create the shared strategy control block and the CDB's * Get or create the shared strategy control block
*/ */
StrategyControl = (BufferStrategyControl *) StrategyControl = (BufferStrategyControl *)
ShmemInitStruct("Buffer Strategy Status", ShmemInitStruct("Buffer Strategy Status",
sizeof(BufferStrategyControl) + sizeof(BufferStrategyControl),
sizeof(BufferStrategyCDB) * (NBuffers * 2 - 1),
&found); &found);
StrategyCDB = &(StrategyControl->cdb[0]);
if (!found) if (!found)
{ {
...@@ -923,39 +258,11 @@ StrategyInitialize(bool init) ...@@ -923,39 +258,11 @@ StrategyInitialize(bool init)
* Grab the whole linked list of free buffers for our strategy. We * Grab the whole linked list of free buffers for our strategy. We
* assume it was previously set up by InitBufferPool(). * assume it was previously set up by InitBufferPool().
*/ */
StrategyControl->listFreeBuffers = 0; StrategyControl->firstFreeBuffer = 0;
StrategyControl->lastFreeBuffer = NBuffers - 1;
/* /* Initialize the clock sweep pointer */
* We start off with a target T1 list size of half the available StrategyControl->nextVictimBuffer = 0;
* cache blocks.
*/
StrategyControl->target_T1_size = NBuffers / 2;
/*
* Initialize B1, T1, T2 and B2 lists to be empty
*/
for (i = 0; i < STRAT_NUM_LISTS; i++)
{
StrategyControl->listHead[i] = -1;
StrategyControl->listTail[i] = -1;
StrategyControl->listSize[i] = 0;
StrategyControl->num_hit[i] = 0;
}
StrategyControl->num_lookup = 0;
StrategyControl->stat_report = 0;
/*
* All CDB's are linked as the listUnusedCDB
*/
for (i = 0; i < NBuffers * 2; i++)
{
StrategyCDB[i].next = i + 1;
StrategyCDB[i].list = STRAT_LIST_UNUSED;
CLEAR_BUFFERTAG(StrategyCDB[i].buf_tag);
StrategyCDB[i].buf_id = -1;
}
StrategyCDB[NBuffers * 2 - 1].next = -1;
StrategyControl->listUnusedCDB = 0;
} }
else else
Assert(!init); Assert(!init);
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.62 2005/01/10 20:02:21 tgl Exp $ * $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.63 2005/03/04 20:21:06 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -24,6 +24,10 @@ ...@@ -24,6 +24,10 @@
/*#define LBDEBUG*/ /*#define LBDEBUG*/
/* Note: this macro only works on local buffers, not shared ones! */
#define LocalBufHdrGetBlock(bufHdr) \
LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
/* should be a GUC parameter some day */ /* should be a GUC parameter some day */
int NLocBuffer = 64; int NLocBuffer = 64;
...@@ -39,7 +43,7 @@ static int nextFreeLocalBuf = 0; ...@@ -39,7 +43,7 @@ static int nextFreeLocalBuf = 0;
* allocate a local buffer. We do round robin allocation for now. * allocate a local buffer. We do round robin allocation for now.
* *
* API is similar to bufmgr.c's BufferAlloc, except that we do not need * API is similar to bufmgr.c's BufferAlloc, except that we do not need
* to have the BufMgrLock since this is all local. Also, IO_IN_PROGRESS * to do any locking since this is all local. Also, IO_IN_PROGRESS
* does not get set. * does not get set.
*/ */
BufferDesc * BufferDesc *
...@@ -47,11 +51,12 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -47,11 +51,12 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
{ {
BufferTag newTag; /* identity of requested block */ BufferTag newTag; /* identity of requested block */
int i; int i;
int trycounter;
BufferDesc *bufHdr; BufferDesc *bufHdr;
INIT_BUFFERTAG(newTag, reln, blockNum); INIT_BUFFERTAG(newTag, reln, blockNum);
/* a low tech search for now -- not optimized for scans */ /* a low tech search for now -- should use a hashtable */
for (i = 0; i < NLocBuffer; i++) for (i = 0; i < NLocBuffer; i++)
{ {
bufHdr = &LocalBufferDescriptors[i]; bufHdr = &LocalBufferDescriptors[i];
...@@ -81,32 +86,44 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -81,32 +86,44 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
RelationGetRelid(reln), blockNum, -nextFreeLocalBuf - 1); RelationGetRelid(reln), blockNum, -nextFreeLocalBuf - 1);
#endif #endif
/* need to get a new buffer (round robin for now) */ /*
bufHdr = NULL; * Need to get a new buffer. We use a clock sweep algorithm
for (i = 0; i < NLocBuffer; i++) * (essentially the same as what freelist.c does now...)
*/
trycounter = NLocBuffer;
for (;;)
{ {
int b = (nextFreeLocalBuf + i) % NLocBuffer; int b = nextFreeLocalBuf;
if (++nextFreeLocalBuf >= NLocBuffer)
nextFreeLocalBuf = 0;
if (LocalRefCount[b] == 0)
{
bufHdr = &LocalBufferDescriptors[b]; bufHdr = &LocalBufferDescriptors[b];
if (LocalRefCount[b] == 0 && bufHdr->usage_count == 0)
{
LocalRefCount[b]++; LocalRefCount[b]++;
ResourceOwnerRememberBuffer(CurrentResourceOwner, ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(bufHdr)); BufferDescriptorGetBuffer(bufHdr));
nextFreeLocalBuf = (b + 1) % NLocBuffer;
break; break;
} }
if (bufHdr->usage_count > 0)
{
bufHdr->usage_count--;
trycounter = NLocBuffer;
} }
if (bufHdr == NULL) else if (--trycounter == 0)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES), (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("no empty local buffer available"))); errmsg("no empty local buffer available")));
}
/* /*
* this buffer is not referenced but it might still be dirty. if * this buffer is not referenced but it might still be dirty. if
* that's the case, write it out before reusing it! * that's the case, write it out before reusing it!
*/ */
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) if (bufHdr->flags & BM_DIRTY)
{ {
SMgrRelation oreln; SMgrRelation oreln;
...@@ -116,7 +133,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -116,7 +133,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
/* And write... */ /* And write... */
smgrwrite(oreln, smgrwrite(oreln,
bufHdr->tag.blockNum, bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data), (char *) LocalBufHdrGetBlock(bufHdr),
true); true);
LocalBufferFlushCount++; LocalBufferFlushCount++;
...@@ -129,7 +146,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -129,7 +146,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
* use, so it's okay to do it (and possibly error out) before marking * use, so it's okay to do it (and possibly error out) before marking
* the buffer as not dirty. * the buffer as not dirty.
*/ */
if (bufHdr->data == (SHMEM_OFFSET) 0) if (LocalBufHdrGetBlock(bufHdr) == NULL)
{ {
char *data = (char *) malloc(BLCKSZ); char *data = (char *) malloc(BLCKSZ);
...@@ -138,17 +155,10 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -138,17 +155,10 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
(errcode(ERRCODE_OUT_OF_MEMORY), (errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory"))); errmsg("out of memory")));
/*
* This is a bit of a hack: bufHdr->data needs to be a shmem
* offset for consistency with the shared-buffer case, so make it
* one even though it's not really a valid shmem offset.
*/
bufHdr->data = MAKE_OFFSET(data);
/* /*
* Set pointer for use by BufferGetBlock() macro. * Set pointer for use by BufferGetBlock() macro.
*/ */
LocalBufferBlockPointers[-(bufHdr->buf_id + 2)] = (Block) data; LocalBufHdrGetBlock(bufHdr) = (Block) data;
} }
/* /*
...@@ -156,7 +166,8 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) ...@@ -156,7 +166,8 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
*/ */
bufHdr->tag = newTag; bufHdr->tag = newTag;
bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR); bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
bufHdr->cntxDirty = false; bufHdr->flags |= BM_TAG_VALID;
bufHdr->usage_count = 0;
*foundPtr = FALSE; *foundPtr = FALSE;
return bufHdr; return bufHdr;
...@@ -170,6 +181,7 @@ void ...@@ -170,6 +181,7 @@ void
WriteLocalBuffer(Buffer buffer, bool release) WriteLocalBuffer(Buffer buffer, bool release)
{ {
int bufid; int bufid;
BufferDesc *bufHdr;
Assert(BufferIsLocal(buffer)); Assert(BufferIsLocal(buffer));
...@@ -178,12 +190,18 @@ WriteLocalBuffer(Buffer buffer, bool release) ...@@ -178,12 +190,18 @@ WriteLocalBuffer(Buffer buffer, bool release)
#endif #endif
bufid = -(buffer + 1); bufid = -(buffer + 1);
LocalBufferDescriptors[bufid].flags |= BM_DIRTY;
Assert(LocalRefCount[bufid] > 0);
bufHdr = &LocalBufferDescriptors[bufid];
bufHdr->flags |= BM_DIRTY;
if (release) if (release)
{ {
Assert(LocalRefCount[bufid] > 0);
LocalRefCount[bufid]--; LocalRefCount[bufid]--;
if (LocalRefCount[bufid] == 0 &&
bufHdr->usage_count < BM_MAX_USAGE_COUNT)
bufHdr->usage_count++;
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer); ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
} }
} }
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
* Written by Peter Eisentraut <peter_e@gmx.net>. * Written by Peter Eisentraut <peter_e@gmx.net>.
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.253 2005/03/01 20:23:34 tgl Exp $ * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.254 2005/03/04 20:21:06 tgl Exp $
* *
*-------------------------------------------------------------------- *--------------------------------------------------------------------
*/ */
...@@ -77,7 +77,6 @@ extern bool Log_disconnections; ...@@ -77,7 +77,6 @@ extern bool Log_disconnections;
extern DLLIMPORT bool check_function_bodies; extern DLLIMPORT bool check_function_bodies;
extern int CommitDelay; extern int CommitDelay;
extern int CommitSiblings; extern int CommitSiblings;
extern int DebugSharedBuffers;
extern char *default_tablespace; extern char *default_tablespace;
static const char *assign_log_destination(const char *value, static const char *assign_log_destination(const char *value,
...@@ -1230,15 +1229,6 @@ static struct config_int ConfigureNamesInt[] = ...@@ -1230,15 +1229,6 @@ static struct config_int ConfigureNamesInt[] =
-1, -1, INT_MAX / 1000, NULL, NULL -1, -1, INT_MAX / 1000, NULL, NULL
}, },
{
{"debug_shared_buffers", PGC_POSTMASTER, STATS_MONITORING,
gettext_noop("Interval to report shared buffer status in seconds"),
NULL
},
&DebugSharedBuffers,
0, 0, 600, NULL, NULL
},
{ {
{"bgwriter_delay", PGC_SIGHUP, RESOURCES, {"bgwriter_delay", PGC_SIGHUP, RESOURCES,
gettext_noop("Background writer sleep time between rounds in milliseconds"), gettext_noop("Background writer sleep time between rounds in milliseconds"),
...@@ -1249,21 +1239,21 @@ static struct config_int ConfigureNamesInt[] = ...@@ -1249,21 +1239,21 @@ static struct config_int ConfigureNamesInt[] =
}, },
{ {
{"bgwriter_percent", PGC_SIGHUP, RESOURCES, {"bgwriter_lru_maxpages", PGC_SIGHUP, RESOURCES,
gettext_noop("Background writer percentage of dirty buffers to flush per round"), gettext_noop("Background writer maximum number of all pages to flush per round"),
NULL NULL
}, },
&BgWriterPercent, &bgwriter_lru_maxpages,
1, 0, 100, NULL, NULL 5, 0, 1000, NULL, NULL
}, },
{ {
{"bgwriter_maxpages", PGC_SIGHUP, RESOURCES, {"bgwriter_all_maxpages", PGC_SIGHUP, RESOURCES,
gettext_noop("Background writer maximum number of pages to flush per round"), gettext_noop("Background writer maximum number of LRU pages to flush per round"),
NULL NULL
}, },
&BgWriterMaxPages, &bgwriter_all_maxpages,
100, 0, 1000, NULL, NULL 5, 0, 1000, NULL, NULL
}, },
{ {
...@@ -1394,6 +1384,24 @@ static struct config_real ConfigureNamesReal[] = ...@@ -1394,6 +1384,24 @@ static struct config_real ConfigureNamesReal[] =
MAX_GEQO_SELECTION_BIAS, NULL, NULL MAX_GEQO_SELECTION_BIAS, NULL, NULL
}, },
{
{"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES,
gettext_noop("Background writer percentage of LRU buffers to flush per round"),
NULL
},
&bgwriter_lru_percent,
1.0, 0.0, 100.0, NULL, NULL
},
{
{"bgwriter_all_percent", PGC_SIGHUP, RESOURCES,
gettext_noop("Background writer percentage of all buffers to flush per round"),
NULL
},
&bgwriter_all_percent,
0.333, 0.0, 100.0, NULL, NULL
},
{ {
{"seed", PGC_USERSET, UNGROUPED, {"seed", PGC_USERSET, UNGROUPED,
gettext_noop("Sets the seed for random-number generation."), gettext_noop("Sets the seed for random-number generation."),
......
...@@ -99,8 +99,10 @@ ...@@ -99,8 +99,10 @@
# - Background writer - # - Background writer -
#bgwriter_delay = 200 # 10-10000 milliseconds between rounds #bgwriter_delay = 200 # 10-10000 milliseconds between rounds
#bgwriter_percent = 1 # 0-100% of dirty buffers in each round #bgwriter_lru_percent = 1.0 # 0-100% of LRU buffers scanned in each round
#bgwriter_maxpages = 100 # 0-1000 buffers max per round #bgwriter_lru_maxpages = 5 # 0-1000 buffers max written per round
#bgwriter_all_percent = 0.333 # 0-100% of all buffers scanned in each round
#bgwriter_all_maxpages = 5 # 0-1000 buffers max written per round
#--------------------------------------------------------------------------- #---------------------------------------------------------------------------
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/resowner/resowner.c,v 1.9 2004/12/31 22:02:50 pgsql Exp $ * $PostgreSQL: pgsql/src/backend/utils/resowner/resowner.c,v 1.10 2005/03/04 20:21:06 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -200,12 +200,7 @@ ResourceOwnerReleaseInternal(ResourceOwner owner, ...@@ -200,12 +200,7 @@ ResourceOwnerReleaseInternal(ResourceOwner owner,
* that would indicate failure to clean up the executor correctly --- * that would indicate failure to clean up the executor correctly ---
* so issue warnings. In the abort case, just clean up quietly. * so issue warnings. In the abort case, just clean up quietly.
* *
* XXX this is fairly inefficient due to multiple BufMgrLock * We are careful to do the releasing back-to-front, so as to
* grabs if there are lots of buffers to be released, but we
* don't expect many (indeed none in the success case) so it's
* probably not worth optimizing.
*
* We are however careful to release back-to-front, so as to
* avoid O(N^2) behavior in ResourceOwnerForgetBuffer(). * avoid O(N^2) behavior in ResourceOwnerForgetBuffer().
*/ */
while (owner->nbuffers > 0) while (owner->nbuffers > 0)
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
* *
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* *
* $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.4 2004/12/31 22:03:39 pgsql Exp $ * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.5 2005/03/04 20:21:06 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -18,8 +18,6 @@ ...@@ -18,8 +18,6 @@
/* GUC options */ /* GUC options */
extern int BgWriterDelay; extern int BgWriterDelay;
extern int BgWriterPercent;
extern int BgWriterMaxPages;
extern int CheckPointTimeout; extern int CheckPointTimeout;
extern int CheckPointWarning; extern int CheckPointWarning;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.76 2005/02/03 23:29:19 tgl Exp $ * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.77 2005/03/04 20:21:07 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -19,24 +19,39 @@ ...@@ -19,24 +19,39 @@
#include "storage/buf.h" #include "storage/buf.h"
#include "storage/lwlock.h" #include "storage/lwlock.h"
#include "storage/shmem.h" #include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/rel.h" #include "utils/rel.h"
/* /*
* Flags for buffer descriptors * Flags for buffer descriptors
*
* Note: TAG_VALID essentially means that there is a buffer hashtable
* entry associated with the buffer's tag.
*/ */
#define BM_DIRTY (1 << 0) /* data needs writing */ #define BM_DIRTY (1 << 0) /* data needs writing */
#define BM_VALID (1 << 1) /* data is valid */ #define BM_VALID (1 << 1) /* data is valid */
#define BM_IO_IN_PROGRESS (1 << 2) /* read or write in #define BM_TAG_VALID (1 << 2) /* tag is assigned */
#define BM_IO_IN_PROGRESS (1 << 3) /* read or write in
* progress */ * progress */
#define BM_IO_ERROR (1 << 3) /* previous I/O failed */ #define BM_IO_ERROR (1 << 4) /* previous I/O failed */
#define BM_JUST_DIRTIED (1 << 4) /* dirtied since write #define BM_JUST_DIRTIED (1 << 5) /* dirtied since write
* started */ * started */
#define BM_PIN_COUNT_WAITER (1 << 5) /* have waiter for sole #define BM_PIN_COUNT_WAITER (1 << 6) /* have waiter for sole
* pin */ * pin */
typedef bits16 BufFlags; typedef bits16 BufFlags;
/*
* The maximum allowed value of usage_count represents a tradeoff between
* accuracy and speed of the clock-sweep buffer management algorithm. A
* large value (comparable to NBuffers) would approximate LRU semantics.
* But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
* clock sweeps to find a free buffer, so in practice we don't want the
* value to be very large.
*/
#define BM_MAX_USAGE_COUNT 5
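
[Editor's note] A concrete worst-case bound (not from the source): with BM_MAX_USAGE_COUNT = 5 and every buffer unpinned but saturated at the cap, the sweep hand must pass each buffer five times to decay its count to zero and a sixth time to claim it, so at most (5 + 1) * NBuffers buffer visits happen before a victim turns up. Raising the cap toward NBuffers would approximate LRU at the cost of proportionally longer searches.
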
/* /*
* Buffer tag identifies which disk block the buffer contains. * Buffer tag identifies which disk block the buffer contains.
* *
...@@ -77,45 +92,81 @@ typedef struct buftag ...@@ -77,45 +92,81 @@ typedef struct buftag
/* /*
* BufferDesc -- shared descriptor/state data for a single shared buffer. * BufferDesc -- shared descriptor/state data for a single shared buffer.
*
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
* usage_count, refcount, or wait_backend_id fields. buf_id field never
* changes after initialization, so does not need locking. freeNext is
* protected by the BufFreelistLock not buf_hdr_lock. The LWLocks can take
* care of themselves. The buf_hdr_lock is *not* used to control access to
* the data in the buffer!
*
* An exception is that if we have the buffer pinned, its tag can't change
* underneath us, so we can examine the tag without locking the spinlock.
* Also, in places we do one-time reads of the flags without bothering to
* lock the spinlock; this is generally for situations where we don't expect
* the flag bit being tested to be changing.
*
* We can't physically remove items from a disk page if another backend has
* the buffer pinned. Hence, a backend may need to wait for all other pins
* to go away. This is signaled by storing its own backend ID into
* wait_backend_id and setting flag bit BM_PIN_COUNT_WAITER. At present,
* there can be only one such waiter per buffer.
*
* We use this same struct for local buffer headers, but the lock fields
* are not used and not all of the flag bits are useful either.
*/ */
typedef struct sbufdesc typedef struct sbufdesc
{ {
Buffer bufNext; /* link in freelist chain */ BufferTag tag; /* ID of page contained in buffer */
SHMEM_OFFSET data; /* pointer to data in buf pool */
/* tag and id must be together for table lookup (still true?) */
BufferTag tag; /* file/block identifier */
int buf_id; /* buffer's index number (from 0) */
BufFlags flags; /* see bit definitions above */ BufFlags flags; /* see bit definitions above */
uint16 usage_count; /* usage counter for clock sweep code */
unsigned refcount; /* # of backends holding pins on buffer */ unsigned refcount; /* # of backends holding pins on buffer */
BackendId wait_backend_id; /* backend ID of pin-count waiter */
LWLockId io_in_progress_lock; /* to wait for I/O to complete */ slock_t buf_hdr_lock; /* protects the above fields */
LWLockId cntx_lock; /* to lock access to page context */
bool cntxDirty; /* new way to mark block as dirty */ int buf_id; /* buffer's index number (from 0) */
int freeNext; /* link in freelist chain */
/* LWLockId io_in_progress_lock; /* to wait for I/O to complete */
* We can't physically remove items from a disk page if another LWLockId content_lock; /* to lock access to buffer contents */
* backend has the buffer pinned. Hence, a backend may need to wait
* for all other pins to go away. This is signaled by storing its own
* backend ID into wait_backend_id and setting flag bit
* BM_PIN_COUNT_WAITER. At present, there can be only one such waiter
* per buffer.
*/
BackendId wait_backend_id; /* backend ID of pin-count waiter */
} BufferDesc; } BufferDesc;
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1) #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
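A small hedged sketch of the off-by-one convention implied by this macro: Buffer numbers used elsewhere in the backend are 1-based, while buf_id indexes the BufferDescriptors array from 0 (get_descriptor_sketch is illustrative, not an existing helper).

#include "postgres.h"
#include "storage/buf_internals.h"

/*
 * Map a 1-based Buffer number back to its shared descriptor.
 */
static BufferDesc *
get_descriptor_sketch(Buffer buffer)
{
    BufferDesc *bufHdr = &BufferDescriptors[buffer - 1];

    Assert(BufferDescriptorGetBuffer(bufHdr) == buffer);
    return bufHdr;
}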
/*
* The freeNext field is either the index of the next freelist entry,
* or one of these special values:
*/
#define FREENEXT_END_OF_LIST (-1)
#define FREENEXT_NOT_IN_LIST (-2)
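For illustration only, a push onto the shared freelist could look like the sketch below; the real list head lives in freelist.c's private state, so firstFreeBuffer here is a stand-in. FREENEXT_NOT_IN_LIST flags buffers that are not linked at all, and FREENEXT_END_OF_LIST terminates the chain. The public entry point for this operation is StrategyFreeBuffer(), declared later in this header.

#include "storage/buf_internals.h"

static int  firstFreeBuffer = FREENEXT_END_OF_LIST;    /* stand-in for freelist.c's list head */

/*
 * Link a buffer at the head of the freelist.  The caller must hold
 * BufFreelistLock, which protects both freeNext and the list head.
 */
static void
freelist_push_sketch(BufferDesc *buf)
{
    if (buf->freeNext != FREENEXT_NOT_IN_LIST)
        return;                 /* already linked somewhere in the list */
    buf->freeNext = firstFreeBuffer;
    firstFreeBuffer = buf->buf_id;
}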
/*
* Macros for acquiring/releasing a buffer header's spinlock. The
* NoHoldoff cases may be used when we know that we hold some LWLock
* and therefore interrupts are already held off. Do not apply these
* to local buffers!
*/
#define LockBufHdr(bufHdr) \
SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
#define UnlockBufHdr(bufHdr) \
SpinLockRelease(&(bufHdr)->buf_hdr_lock)
#define LockBufHdr_NoHoldoff(bufHdr) \
SpinLockAcquire_NoHoldoff(&(bufHdr)->buf_hdr_lock)
#define UnlockBufHdr_NoHoldoff(bufHdr) \
SpinLockRelease_NoHoldoff(&(bufHdr)->buf_hdr_lock)
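As a usage illustration of the locking rules above (a sketch, not the real PinBuffer in bufmgr.c): a backend that already holds an LWLock, and therefore has interrupts held off, can take the header spinlock with the NoHoldoff variants, bump the reference and usage counts, and release it.

#include "storage/buf_internals.h"

/*
 * Pin a shared buffer whose descriptor has already been located.  The
 * caller is assumed to hold an LWLock (e.g. BufMappingLock), so the
 * NoHoldoff spinlock variants are safe; the real code also updates the
 * per-backend PrivateRefCount array, omitted here.
 */
static void
pin_buffer_sketch(BufferDesc *buf)
{
    LockBufHdr_NoHoldoff(buf);
    buf->refcount++;                        /* one more backend holds a pin */
    if (buf->usage_count < BM_MAX_USAGE_COUNT)
        buf->usage_count++;                 /* feed the clock sweep */
    UnlockBufHdr_NoHoldoff(buf);
}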
/* in bufmgr.c */ /* in buf_init.c */
extern BufferDesc *BufferDescriptors; extern BufferDesc *BufferDescriptors;
/* in localbuf.c */ /* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors; extern BufferDesc *LocalBufferDescriptors;
/* counters in buf_init.c */ /* in freelist.c */
extern bool strategy_hint_vacuum;
/* event counters in buf_init.c */
extern long int ReadBufferCount; extern long int ReadBufferCount;
extern long int ReadLocalBufferCount; extern long int ReadLocalBufferCount;
extern long int BufferHitCount; extern long int BufferHitCount;
...@@ -129,15 +180,9 @@ extern long int LocalBufferFlushCount; ...@@ -129,15 +180,9 @@ extern long int LocalBufferFlushCount;
*/ */
/* freelist.c */ /* freelist.c */
extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck, extern BufferDesc *StrategyGetBuffer(void);
int *cdb_found_index); extern void StrategyFreeBuffer(BufferDesc *buf, bool at_head);
extern BufferDesc *StrategyGetBuffer(int *cdb_replace_index); extern int StrategySyncStart(void);
extern void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
int cdb_found_index, int cdb_replace_index);
extern void StrategyInvalidateBuffer(BufferDesc *buf);
extern void StrategyHintVacuum(bool vacuum_active);
extern int StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
int max_buffers);
extern int StrategyShmemSize(void); extern int StrategyShmemSize(void);
extern void StrategyInitialize(bool init); extern void StrategyInitialize(bool init);
...@@ -145,7 +190,7 @@ extern void StrategyInitialize(bool init); ...@@ -145,7 +190,7 @@ extern void StrategyInitialize(bool init);
extern int BufTableShmemSize(int size); extern int BufTableShmemSize(int size);
extern void InitBufTable(int size); extern void InitBufTable(int size);
extern int BufTableLookup(BufferTag *tagPtr); extern int BufTableLookup(BufferTag *tagPtr);
extern void BufTableInsert(BufferTag *tagPtr, int buf_id); extern int BufTableInsert(BufferTag *tagPtr, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr); extern void BufTableDelete(BufferTag *tagPtr);
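A hedged sketch of how these calls might combine in a lookup-or-allocate path; the real logic is in bufmgr.c and also handles pinning, I/O, and removal of the victim's old hash entry. The interpretation of BufTableInsert's new int result (-1 for a successful insert, otherwise the colliding buffer's ID) is an assumption here, as is the simplified locking.

#include "storage/buf_internals.h"
#include "storage/lwlock.h"

/*
 * Look up a page in the buffer pool, or pick a replacement victim for it.
 * Pinning, I/O, flag manipulation, and eviction of the victim's old hash
 * entry (BufTableDelete) are all omitted.
 */
static int
lookup_or_allocate_sketch(BufferTag *tag)
{
    int         buf_id;
    BufferDesc *victim;

    /* Fast path: a shared lock is enough for a pure lookup. */
    LWLockAcquire(BufMappingLock, LW_SHARED);
    buf_id = BufTableLookup(tag);
    LWLockRelease(BufMappingLock);
    if (buf_id >= 0)
        return buf_id;

    /* Not cached: ask the clock sweep / freelist for a victim. */
    victim = StrategyGetBuffer();

    /*
     * Insert the new tag.  Assumption: a non-negative return means some
     * other backend inserted the same tag first, so use that buffer.
     */
    LWLockAcquire(BufMappingLock, LW_EXCLUSIVE);
    buf_id = BufTableInsert(tag, victim->buf_id);
    LWLockRelease(BufMappingLock);

    return (buf_id >= 0) ? buf_id : victim->buf_id;
}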
/* localbuf.c */ /* localbuf.c */
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.89 2004/12/31 22:03:42 pgsql Exp $ * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.90 2005/03/04 20:21:07 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -27,21 +27,25 @@ extern DLLIMPORT int NBuffers; ...@@ -27,21 +27,25 @@ extern DLLIMPORT int NBuffers;
/* in bufmgr.c */ /* in bufmgr.c */
extern bool zero_damaged_pages; extern bool zero_damaged_pages;
extern double bgwriter_lru_percent;
extern double bgwriter_all_percent;
extern int bgwriter_lru_maxpages;
extern int bgwriter_all_maxpages;
/* in buf_init.c */ /* in buf_init.c */
extern DLLIMPORT Block *BufferBlockPointers; extern DLLIMPORT Block *BufferBlockPointers;
extern int32 *PrivateRefCount; extern DLLIMPORT int32 *PrivateRefCount;
/* in localbuf.c */ /* in localbuf.c */
extern DLLIMPORT int NLocBuffer; extern DLLIMPORT int NLocBuffer;
extern DLLIMPORT Block *LocalBufferBlockPointers; extern DLLIMPORT Block *LocalBufferBlockPointers;
extern int32 *LocalRefCount; extern DLLIMPORT int32 *LocalRefCount;
/* special block number for ReadBuffer() */ /* special block number for ReadBuffer() */
#define P_NEW InvalidBlockNumber /* grow the file to get a new page */ #define P_NEW InvalidBlockNumber /* grow the file to get a new page */
/* /*
* Buffer context lock modes * Buffer content lock modes (mode argument for LockBuffer())
*/ */
#define BUFFER_LOCK_UNLOCK 0 #define BUFFER_LOCK_UNLOCK 0
#define BUFFER_LOCK_SHARE 1 #define BUFFER_LOCK_SHARE 1
...@@ -150,8 +154,12 @@ extern void LockBufferForCleanup(Buffer buffer); ...@@ -150,8 +154,12 @@ extern void LockBufferForCleanup(Buffer buffer);
extern void AbortBufferIO(void); extern void AbortBufferIO(void);
extern void BufmgrCommit(void); extern void BufmgrCommit(void);
extern int BufferSync(int percent, int maxpages); extern void BufferSync(void);
extern void BgBufferSync(void);
extern void InitLocalBuffer(void); extern void InitLocalBuffer(void);
/* in freelist.c */
extern void StrategyHintVacuum(bool vacuum_active);
#endif #endif
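Putting the two new entry points in context: BufferSync() is now the checkpoint-time call, while BgBufferSync() performs one incremental round of writes. A sketch of the background writer's loop follows, assuming a BgWriterDelay variable holds the bgwriter_delay setting in milliseconds; the real loop in postmaster/bgwriter.c also services checkpoint requests and shutdown signals.

#include "postgres.h"
#include "miscadmin.h"          /* pg_usleep() */
#include "storage/bufmgr.h"

extern int  BgWriterDelay;      /* assumed GUC variable behind bgwriter_delay, in msec */

/*
 * Background writer activity loop: write a limited number of dirty
 * buffers, then sleep for bgwriter_delay.
 */
static void
bgwriter_loop_sketch(void)
{
    for (;;)
    {
        BgBufferSync();                     /* lru_* and all_* write rounds */
        pg_usleep(BgWriterDelay * 1000L);   /* milliseconds -> microseconds */
    }
}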
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.16 2004/12/31 22:03:42 pgsql Exp $ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.17 2005/03/04 20:21:07 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -25,7 +25,8 @@ ...@@ -25,7 +25,8 @@
*/ */
typedef enum LWLockId typedef enum LWLockId
{ {
BufMgrLock, BufMappingLock,
BufFreelistLock,
LockMgrLock, LockMgrLock,
OidGenLock, OidGenLock,
XidGenLock, XidGenLock,
......