/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead
 * tuple TIDs, with the next biggest need being storage for per-disk-page
 * free space info.  We want to ensure we can vacuum even the very largest
 * relations with finite memory space usage.  To do that, we set upper bounds
 * on the number of tuples and pages we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem memory space to keep
 * track of dead tuples.  We initially allocate an array of TIDs of that size.
 * If the array threatens to overflow, we suspend the heap scan phase and
 * perform a pass of index cleanup and page compaction, then resume the heap
 * scan with an empty TID array.
 *
 * We can limit the storage for page free space to MaxFSMPages entries,
 * since that's the most the free space map will be willing to remember
 * anyway.	If the relation has fewer than that many pages with free space,
 * life is easy: just build an array of per-page info.	If it has more,
 * we store the free space info as a heap ordered by amount of free space,
 * so that we can discard the pages with least free space to ensure we never
 * have more than MaxFSMPages entries in all.  The surviving page entries
 * are passed to the free space map at conclusion of the scan.
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
 *
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.98 2007/09/20 21:43:27 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/freespace.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"


/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16


typedef struct LVRelStats
{
	/* hasindex = true means two-pass strategy; false means one-pass */
	bool		hasindex;
	/* Overall statistics about rel */
	BlockNumber rel_pages;
	double		rel_tuples;
	BlockNumber pages_removed;
	double		tuples_deleted;
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	Size		threshold;		/* minimum interesting free space */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	int			num_dead_tuples;	/* current # of entries */
	int			max_dead_tuples;	/* # slots allocated in array */
	ItemPointer dead_tuples;	/* array of ItemPointerData */
	/* Array or heap of per-page info about free space */
	/* We use a simple array until it fills up, then convert to heap */
	bool		fs_is_heap;		/* are we using heap organization? */
	int			num_free_pages; /* current # of entries */
	int			max_free_pages; /* # slots allocated in array */
	PageFreeSpaceInfo *free_pages;		/* array or heap of blkno/avail */
	BlockNumber tot_free_pages; /* total pages with >= threshold space */
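	/* number of index-vacuum cycles completed so far */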
	int			num_index_scans;
} LVRelStats;


/* A few variables that don't seem worth passing around as parameters */
static int	elevel = -1;

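/*
 * Cutoff XIDs for this vacuum run, set via vacuum_set_xid_limits():
 * tuples whose deleting transaction is older than OldestXmin are removable,
 * and live tuples with xmin older than FreezeLimit are frozen.
 */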
static TransactionId OldestXmin;
static TransactionId FreezeLimit;

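/*
 * Buffer access strategy supplied by the caller; used for all heap and
 * index reads done by this vacuum.
 */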
static BufferAccessStrategy vac_strategy;


/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static void lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats);
static void lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats);
static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
						 LVRelStats *vacrelstats);
static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr);
static void lazy_record_free_space(LVRelStats *vacrelstats,
					   BlockNumber page, Size avail);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static void lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats);
static int	vac_cmp_itemptr(const void *left, const void *right);
static int	vac_cmp_page_spaces(const void *left, const void *right);


/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
				BufferAccessStrategy bstrategy)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	BlockNumber possibly_freeable;
	PGRUsage	ru0;
	TimestampTz	starttime = 0;

	pg_rusage_init(&ru0);

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum > 0)
		starttime = GetCurrentTimestamp();

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	vac_strategy = bstrategy;

	vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	/* Set threshold for interesting free space = average request size */
	/* XXX should we scale it up or down?  Adjust vacuum.c too, if so */
	vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);

	vacrelstats->num_index_scans = 0;

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 */
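	/* possibly_freeable = trailing pages with no remaining tuples */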
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
		lazy_truncate_heap(onerel, vacrelstats);

	/* Update shared free space map with final free space info */
	lazy_update_fsm(onerel, vacrelstats);

	if (vacrelstats->tot_free_pages > MaxFSMPages)
		ereport(WARNING,
				(errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space",
						get_namespace_name(RelationGetNamespace(onerel)),
						RelationGetRelationName(onerel)),
				 errhint((vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20 ?
							/* Only suggest VACUUM FULL if 20% free */
							"Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\"." :
							"Consider increasing the configuration parameter \"max_fsm_pages\"."))));

	/* Update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel),
						vacrelstats->rel_pages,
						vacrelstats->rel_tuples,
						vacrelstats->hasindex,
						FreezeLimit);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
						 vacstmt->analyze, vacrelstats->rel_tuples);

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum >= 0)
	{
		if (Log_autovacuum == 0 ||
			TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
									   Log_autovacuum))
			ereport(LOG,
					(errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
							"pages: %d removed, %d remain\n"
							"tuples: %.0f removed, %.0f remain\n"
							"system usage: %s",
							get_database_name(MyDatabaseId),
							get_namespace_name(RelationGetNamespace(onerel)),
							RelationGetRelationName(onerel),
							vacrelstats->num_index_scans,
							vacrelstats->pages_removed, vacrelstats->rel_pages,
							vacrelstats->tuples_deleted, vacrelstats->rel_tuples, 
							pg_rusage_show(&ru0))));
	}
}


/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, builds lists of dead tuples
 *		and pages with free space, and calculates statistics on the number
 *		of live tuples in the heap.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 *
 *		If there are no indexes then we just vacuum each dirty page as we
 *		process it, since there's no point in gathering many tuples.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes)
{
	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	BlockNumber empty_pages,
				vacuumed_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	IndexBulkDeleteResult **indstats;
	int			i;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = vacuumed_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;

	indstats = (IndexBulkDeleteResult **)
		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

	nblocks = RelationGetNumberOfBlocks(onerel);
	vacrelstats->rel_pages = nblocks;
	vacrelstats->nonempty_pages = 0;

	lazy_space_alloc(vacrelstats, nblocks);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;
		int			prev_dead_count;
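		/* offsets of tuples we freeze on this page, for one WAL freeze record */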
		OffsetNumber frozen[MaxOffsetNumber];
		int			nfrozen;

		vacuum_delay_point();

		/*
		 * If we are close to overrunning the available space for dead-tuple
		 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
		 */
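		/* ("close" = fewer than one heap page's worth of TID slots left) */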
		if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove index entries */
			for (i = 0; i < nindexes; i++)
				lazy_vacuum_index(Irel[i],
								  &indstats[i],
								  vacrelstats);
			/* Remove tuples from heap */
			lazy_vacuum_heap(onerel, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacrelstats->num_index_scans++;
		}

		buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

		/* We need buffer cleanup lock so that we can prune HOT chains. */
		LockBufferForCleanup(buf);

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			/*
			 * An all-zeroes page could be left over if a backend extends the
			 * relation but crashes before initializing the page. Reclaim such
			 * pages for use.
			 *
			 * We have to be careful here because we could be looking at a
			 * page that someone has just added to the relation and not yet
			 * been able to initialize (see RelationGetBufferForTuple). To
			 * protect against that, release the buffer lock, grab the
			 * relation extension lock momentarily, and re-lock the buffer.
			 * If the page is still uninitialized by then, it must be left
			 * over from a crashed backend, and we can initialize it.
			 *
			 * We don't really need the relation lock when this is a new or
			 * temp relation, but it's probably not worth the code space to
			 * check that, since this surely isn't a critical path.
			 *
			 * Note: the comparable code in vacuum.c need not worry because
			 * it's got exclusive lock on the whole relation.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockRelationForExtension(onerel, ExclusiveLock);
			UnlockRelationForExtension(onerel, ExclusiveLock);
			LockBufferForCleanup(buf);
			if (PageIsNew(page))
			{
				ereport(WARNING,
				(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
						relname, blkno)));
				PageInit(page, BufferGetPageSize(buf), 0);
				empty_pages++;
				lazy_record_free_space(vacrelstats, blkno,
									   PageGetHeapFreeSpace(page));
			}
			MarkBufferDirty(buf);
			UnlockReleaseBuffer(buf);
			continue;
		}

		if (PageIsEmpty(page))
		{
			empty_pages++;
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetHeapFreeSpace(page));
			UnlockReleaseBuffer(buf);
			continue;
		}

		/* 
		 * Prune all HOT-update chains in this page.
		 *
		 * We count tuples removed by the pruning step as removed by VACUUM.
		 */
		tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
										 false, false);

		/*
		 * Now scan the page to collect vacuumable items and check for
		 * tuples requiring freezing.
		 */
		nfrozen = 0;
		hastup = false;
		prev_dead_count = vacrelstats->num_dead_tuples;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/* Unused items require no processing, but we count 'em */
			if (!ItemIdIsUsed(itemid))
			{
				nunused += 1;
				continue;
			}

			/* Redirect items mustn't be touched */
 			if (ItemIdIsRedirected(itemid))
 			{
				hastup = true;	/* this page won't be truncatable */
 				continue;
 			}

 			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			/*
			 * DEAD item pointers are to be vacuumed normally; but we don't
			 * count them in tups_vacuumed, else we'd be double-counting
			 * (at least in the common case where heap_page_prune() just
			 * freed up a non-HOT tuple).
			 */
			if (ItemIdIsDead(itemid))
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				continue;
			}

			Assert(ItemIdIsNormal(itemid));

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:
					/*
					 * Ordinarily, DEAD tuples would have been removed by
					 * heap_page_prune(), but it's possible that the tuple
					 * state changed since heap_page_prune() looked.  In
					 * particular an INSERT_IN_PROGRESS tuple could have
					 * changed to DEAD if the inserter aborted.  So this
					 * cannot be considered an error condition.
					 *
					 * If the tuple is HOT-updated then it must only be
					 * removed by a prune operation; so we keep it just as
					 * if it were RECENTLY_DEAD.  Also, if it's a heap-only
					 * tuple, we choose to keep it, because it'll be a
					 * lot cheaper to get rid of it in the next pruning pass
					 * than to treat it like an indexed tuple.
					 */
					if (HeapTupleIsHotUpdated(&tuple) ||
						HeapTupleIsHeapOnly(&tuple))
						nkeep += 1;
					else
						tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:
					/* Tuple is good --- but let's do some validity checks */
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				hastup = true;

				/*
				 * Each non-removable tuple must be checked to see if it
				 * needs freezing.  Note we already have exclusive buffer lock.
				 */
				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
									  InvalidBuffer))
					frozen[nfrozen++] = offnum;
			}
		}						/* scan along page */

		/*
		 * If we froze any tuples, mark the buffer dirty, and write a WAL
		 * record recording the changes.  We must log the changes to be
		 * crash-safe against future truncation of CLOG.
		 */
		if (nfrozen > 0)
		{
			MarkBufferDirty(buf);
			/* no XLOG for temp tables, though */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
										 frozen, nfrozen);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);
			}
		}

		/*
		 * If there are no indexes then we can vacuum the page right now
		 * instead of doing a second scan.
		 */
		if (nindexes == 0 &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove tuples from heap */
			lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacuumed_pages++;
		}

		/*
		 * If we remembered any tuples for deletion, then the page will be
		 * visited again by lazy_vacuum_heap, which will compute and record
		 * its post-compaction free space.	If not, then we're done with this
		 * page, so remember its free space as-is.	(This path will always be
		 * taken if there are no indexes.)
		 */
		if (vacrelstats->num_dead_tuples == prev_dead_count)
		{
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetHeapFreeSpace(page));
		}

		/* Remember the location of the last page with nonremovable tuples */
		if (hastup)
			vacrelstats->nonempty_pages = blkno + 1;

		UnlockReleaseBuffer(buf);
	}

	/* save stats for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->tuples_deleted = tups_vacuumed;

	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (vacrelstats->num_dead_tuples > 0)
	{
		/* Remove index entries */
		for (i = 0; i < nindexes; i++)
			lazy_vacuum_index(Irel[i],
							  &indstats[i],
							  vacrelstats);
		/* Remove tuples from heap */
		lazy_vacuum_heap(onerel, vacrelstats);
		vacrelstats->num_index_scans++;
	}

	/* Do post-vacuum cleanup and statistics update for each index */
	for (i = 0; i < nindexes; i++)
		lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);

	/* If no indexes, make log report that lazy_vacuum_heap would've made */
	if (vacuumed_pages)
		ereport(elevel,
				(errmsg("\"%s\": removed %.0f row versions in %u pages",
						RelationGetRelationName(onerel),
						tups_vacuumed, vacuumed_pages)));

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "There were %.0f unused item pointers.\n"
					   "%u pages contain useful free space.\n"
					   "%u pages are entirely empty.\n"
					   "%s.",
					   nkeep,
					   nunused,
					   vacrelstats->tot_free_pages,
					   empty_pages,
					   pg_rusage_show(&ru0))));
}


/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove
 * the tuples until we've removed their index entries, and we want to
 * process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
	int			tupindex;
	int			npages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);
	npages = 0;

	tupindex = 0;
	while (tupindex < vacrelstats->num_dead_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		buf = ReadBufferWithStrategy(onerel, tblk, vac_strategy);
		LockBufferForCleanup(buf);
		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		lazy_record_free_space(vacrelstats, tblk,
							   PageGetHeapFreeSpace(page));
		UnlockReleaseBuffer(buf);
		npages++;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 *	lazy_vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 */
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats)
{
	Page		page = BufferGetPage(buffer);
	OffsetNumber unused[MaxOffsetNumber];
	int			uncnt = 0;

	START_CRIT_SECTION();
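	/* the item status changes and the WAL record below must apply atomically */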

	for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;
		ItemId		itemid;

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
		itemid = PageGetItemId(page, toff);
		ItemIdSetUnused(itemid);
		unused[uncnt++] = toff;
	}

	PageRepairFragmentation(page);

	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer,
								NULL, 0, NULL, 0,
								unused, uncnt,
								false);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}

	END_CRIT_SECTION();

	return tupindex;
}

/*
 *	lazy_vacuum_index() -- vacuum one index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		vacrelstats->dead_tuples, and update running statistics.
 */
static void
lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	/* We don't yet know rel_tuples, so pass -1 */
	ivinfo.num_heap_tuples = -1;
	ivinfo.strategy = vac_strategy;

	/* Do bulk deletion */
	*stats = index_bulk_delete(&ivinfo, *stats,
							   lazy_tid_reaped, (void *) vacrelstats);

	ereport(elevel,
			(errmsg("scanned index \"%s\" to remove %d row versions",
					RelationGetRelationName(indrel),
					vacrelstats->num_dead_tuples),
			 errdetail("%s.", pg_rusage_show(&ru0))));
}

/*
 *	lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
 */
static void
lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = vacrelstats->rel_tuples;
	ivinfo.strategy = vac_strategy;

	stats = index_vacuum_cleanup(&ivinfo, stats);
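	/* the index AM may return NULL if it has nothing to report */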

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages,
						stats->num_index_tuples,
						false, InvalidTransactionId);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
			 "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}

/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber old_rel_pages = vacrelstats->rel_pages;
	BlockNumber new_rel_pages;
	PageFreeSpaceInfo *pageSpaces;
	int			n;
	int			i,
				j;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * We need full exclusive lock on the relation in order to do truncation.
	 * If we can't get it, give up rather than waiting --- we don't want to
	 * block other backends, and we don't want to deadlock (which is quite
	 * possible considering we already hold a lower-grade lock).
	 */
	if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
		return;

	/*
	 * Now that we have exclusive lock, look to see if the rel has grown
	 * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
	 * newly added pages presumably contain non-deletable tuples.
	 */
	new_rel_pages = RelationGetNumberOfBlocks(onerel);
	if (new_rel_pages != old_rel_pages)
	{
		/* might as well use the latest news when we update pg_class stats */
		vacrelstats->rel_pages = new_rel_pages;
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Scan backwards from the end to verify that the end pages actually
	 * contain no tuples.  This is *necessary*, not optional, because other
	 * backends could have added tuples to these pages whilst we were
	 * vacuuming.
	 */
	new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);

	if (new_rel_pages >= old_rel_pages)
	{
		/* can't do anything after all */
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Okay to truncate.
	 */
	RelationTruncate(onerel, new_rel_pages);

	/*
	 * Note: once we have truncated, we *must* keep the exclusive lock
	 * until commit.  The sinval message that will be sent at commit
	 * (as a result of vac_update_relstats()) must be received by other
	 * backends, to cause them to reset their rd_targblock values, before
	 * they can safely access the table again.
	 */

	/*
	 * Drop free-space info for removed blocks; these must not get entered
	 * into the FSM!
	 */
	pageSpaces = vacrelstats->free_pages;
	n = vacrelstats->num_free_pages;
	j = 0;
	for (i = 0; i < n; i++)
	{
		if (pageSpaces[i].blkno < new_rel_pages)
		{
			pageSpaces[j] = pageSpaces[i];
			j++;
		}
	}
	vacrelstats->num_free_pages = j;

	/*
	 * If tot_free_pages was more than num_free_pages, we can't tell for sure
	 * what its correct value is now, because we don't know which of the
	 * forgotten pages are getting truncated.  Conservatively set it equal to
	 * num_free_pages.
	 */
	vacrelstats->tot_free_pages = j;

	/* We destroyed the heap ordering, so mark array unordered */
	vacrelstats->fs_is_heap = false;

	/* update statistics */
	vacrelstats->rel_pages = new_rel_pages;
	vacrelstats->pages_removed = old_rel_pages - new_rel_pages;

	ereport(elevel,
			(errmsg("\"%s\": truncated %u to %u pages",
					RelationGetRelationName(onerel),
					old_rel_pages, new_rel_pages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 * Rescan end pages to verify that they are (still) empty of tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber blkno;

	/* Strange coding of loop control is needed because blkno is unsigned */
	blkno = vacrelstats->rel_pages;
	while (blkno > vacrelstats->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		hastup;

		/*
		 * We don't insert a vacuum delay point here, because we have an
		 * exclusive lock on the table which we want to hold for as short
		 * a time as possible.  We still need to check for interrupts
		 * however.
		 */
		CHECK_FOR_INTERRUPTS();

		blkno--;

		buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			/* PageIsNew probably shouldn't happen... */
			UnlockReleaseBuffer(buf);
			continue;
		}

		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Note: any non-unused item should be taken as a reason to keep
			 * this page.  We formerly thought that DEAD tuples could be
			 * thrown away, but that's not so, because we'd not have cleaned
			 * out their index entries.
			 */
			if (ItemIdIsUsed(itemid))
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		UnlockReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages still are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrelstats->nonempty_pages;
}

/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
	long		maxtuples;
	int			maxpages;

	if (vacrelstats->hasindex)
	{
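		/* maintenance_work_mem is expressed in kilobytes */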
		maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
		maxtuples = Min(maxtuples, INT_MAX);
		maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
		/* stay sane if small maintenance_work_mem */
		maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
	}
	else
	{
		maxtuples = MaxHeapTuplesPerPage;
	}

	vacrelstats->num_dead_tuples = 0;
	vacrelstats->max_dead_tuples = (int) maxtuples;
	vacrelstats->dead_tuples = (ItemPointer)
		palloc(maxtuples * sizeof(ItemPointerData));

	maxpages = MaxFSMPages;
	maxpages = Min(maxpages, MaxAllocSize / sizeof(PageFreeSpaceInfo));
	/* No need to allocate more pages than the relation has blocks */
	if (relblocks < (BlockNumber) maxpages)
		maxpages = (int) relblocks;

	vacrelstats->fs_is_heap = false;
	vacrelstats->num_free_pages = 0;
	vacrelstats->max_free_pages = maxpages;
	vacrelstats->free_pages = (PageFreeSpaceInfo *)
		palloc(maxpages * sizeof(PageFreeSpaceInfo));
	vacrelstats->tot_free_pages = 0;
}

/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr)
{
	/*
	 * The array shouldn't overflow under normal behavior, but perhaps it
	 * could if we are given a really small maintenance_work_mem. In that
	 * case, just forget the last few tuples (we'll get 'em next time).
	 */
	if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
	{
		vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
		vacrelstats->num_dead_tuples++;
	}
}

/*
 * lazy_record_free_space - remember free space on one page
 */
static void
lazy_record_free_space(LVRelStats *vacrelstats,
					   BlockNumber page,
					   Size avail)
{
	PageFreeSpaceInfo *pageSpaces;
	int			n;

	/*
	 * A page with less than stats->threshold free space will be forgotten
	 * immediately, and never passed to the free space map.  Removing the
	 * uselessly small entries early saves cycles, and in particular reduces
	 * the amount of time we spend holding the FSM lock when we finally call
	 * RecordRelationFreeSpace.  Since the FSM will probably drop pages with
	 * little free space anyway, there's no point in making this really small.
	 *
	 * XXX Is it worth trying to measure average tuple size, and using that to
	 * adjust the threshold?  Would be worthwhile if FSM has no stats yet for
	 * this relation.  But changing the threshold as we scan the rel might
	 * lead to bizarre behavior, too.  Also, it's probably better if vacuum.c
	 * has the same thresholding behavior as we do here.
	 */
	if (avail < vacrelstats->threshold)
		return;

	/* Count all pages over threshold, even if not enough space in array */
	vacrelstats->tot_free_pages++;

	/* Copy pointers to local variables for notational simplicity */
	pageSpaces = vacrelstats->free_pages;
	n = vacrelstats->max_free_pages;

	/* If we haven't filled the array yet, just keep adding entries */
	if (vacrelstats->num_free_pages < n)
	{
		pageSpaces[vacrelstats->num_free_pages].blkno = page;
		pageSpaces[vacrelstats->num_free_pages].avail = avail;
		vacrelstats->num_free_pages++;
		return;
	}

	/*----------
	 * The rest of this routine works with "heap" organization of the
	 * free space arrays, wherein we maintain the heap property
	 *			avail[(j-1) div 2] <= avail[j]	for 0 < j < n.
	 * In particular, the zero'th element always has the smallest available
	 * space and can be discarded to make room for a new page with more space.
	 * See Knuth's discussion of heap-based priority queues, sec 5.2.3;
	 * but note he uses 1-origin array subscripts, not 0-origin.
	 *----------
	 */
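	/* (0-origin form: the children of entry i are entries 2*i+1 and 2*i+2) */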

	/* If we haven't yet converted the array to heap organization, do it */
	if (!vacrelstats->fs_is_heap)
	{
		/*
		 * Scan backwards through the array, "sift-up" each value into its
		 * correct position.  We can start the scan at n/2-1 since each entry
		 * above that position has no children to worry about.
		 */
		int			l = n / 2;

		while (--l >= 0)
		{
			BlockNumber R = pageSpaces[l].blkno;
			Size		K = pageSpaces[l].avail;
			int			i;		/* i is where the "hole" is */

			i = l;
			for (;;)
			{
				int			j = 2 * i + 1;

				if (j >= n)
					break;
				if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
					j++;
				if (K <= pageSpaces[j].avail)
					break;
				pageSpaces[i] = pageSpaces[j];
				i = j;
			}
			pageSpaces[i].blkno = R;
			pageSpaces[i].avail = K;
		}

		vacrelstats->fs_is_heap = true;
	}

	/* If new page has more than zero'th entry, insert it into heap */
	if (avail > pageSpaces[0].avail)
	{
		/*
		 * Notionally, we replace the zero'th entry with the new data, and
		 * then sift-up to maintain the heap property.	Physically, the new
		 * data doesn't get stored into the arrays until we find the right
		 * location for it.
		 */
		int			i = 0;		/* i is where the "hole" is */

		for (;;)
		{
			int			j = 2 * i + 1;

			if (j >= n)
				break;
			if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
				j++;
			if (avail <= pageSpaces[j].avail)
				break;
			pageSpaces[i] = pageSpaces[j];
			i = j;
		}
		pageSpaces[i].blkno = page;
		pageSpaces[i].avail = avail;
	}
}

/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVRelStats *vacrelstats = (LVRelStats *) state;
	ItemPointer res;

	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) vacrelstats->dead_tuples,
								vacrelstats->num_dead_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}

/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats)
{
	PageFreeSpaceInfo *pageSpaces = vacrelstats->free_pages;
	int			nPages = vacrelstats->num_free_pages;

	/*
	 * Sort data into order, as required by RecordRelationFreeSpace.
	 */
	if (nPages > 1)
		qsort(pageSpaces, nPages, sizeof(PageFreeSpaceInfo),
			  vac_cmp_page_spaces);

	RecordRelationFreeSpace(&onerel->rd_node, vacrelstats->tot_free_pages,
							nPages, pageSpaces);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}

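/*
 * qsort comparator: order free-space entries by block number, as
 * RecordRelationFreeSpace expects (see lazy_update_fsm).
 */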
static int
vac_cmp_page_spaces(const void *left, const void *right)
{
	PageFreeSpaceInfo *linfo = (PageFreeSpaceInfo *) left;
	PageFreeSpaceInfo *rinfo = (PageFreeSpaceInfo *) right;

	if (linfo->blkno < rinfo->blkno)
		return -1;
	else if (linfo->blkno > rinfo->blkno)
		return 1;
	return 0;
}