/*-------------------------------------------------------------------------
 *
 * vacuum.c
 *	  the postgres vacuum cleaner
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.201 2001/07/02 20:50:46 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/file.h>
#include <sys/stat.h>

#ifndef HAVE_GETRUSAGE
#include "rusagestub.h"
#else
#include <sys/resource.h>
#endif

#include "access/genam.h"
#include "access/heapam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/catname.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "storage/freespace.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
#include "utils/temprel.h"

#include "pgstat.h"

extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
			   char *unused, int unlen);
extern XLogRecPtr log_heap_move(Relation reln,
			  Buffer oldbuf, ItemPointerData from,
			  Buffer newbuf, HeapTuple newtup);


typedef struct VRelListData
{
	Oid			vrl_relid;
	struct VRelListData *vrl_next;
} VRelListData;

typedef VRelListData *VRelList;

typedef struct VacPageData
{
	BlockNumber blkno;			/* BlockNumber of this Page */
	Size		free;			/* FreeSpace on this Page */
	uint16		offsets_used;	/* Number of OffNums used by vacuum */
	uint16		offsets_free;	/* Number of OffNums free or to be free */
	OffsetNumber offsets[1];	/* Array of free OffNums */
} VacPageData;

typedef VacPageData *VacPage;

typedef struct VacPageListData
{
	BlockNumber	empty_end_pages;	/* Number of "empty" end-pages */
	int			num_pages;			/* Number of pages in pagedesc */
	int			num_allocated_pages;	/* Number of allocated pages in
										 * pagedesc */
	VacPage    *pagedesc;			/* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;

typedef struct VTupleLinkData
{
	ItemPointerData new_tid;
	ItemPointerData this_tid;
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;

typedef struct VTupleMoveData
{
	ItemPointerData tid;		/* tuple ID */
	VacPage		vacpage;		/* where to move */
	bool		cleanVpd;		/* clean vacpage before using */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;

typedef struct VRelStats
{
	Oid			relid;
	BlockNumber	rel_pages;		/* total number of pages in relation */
	double		rel_tuples;		/* number of live tuples */
	Size		min_tlen;		/* min length of a live tuple */
	Size		max_tlen;		/* max length of a live tuple */
	bool		hasindex;		/* does the relation have any indices? */
	int			num_vtlinks;	/* number of update-chain links below */
	VTupleLink	vtlinks;		/* update-chain links, sorted by new_tid */
} VRelStats;

typedef struct VacRUsage
{
	struct timeval	tv;
	struct rusage	ru;
} VacRUsage;

static MemoryContext vac_context = NULL;

static int	MESSAGE_LEVEL;		/* message level */

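/*
 * Tuples deleted by transactions with XID >= XmaxRecent are treated as
 * "recently dead" and are kept, since concurrent transactions may still
 * need to see them (see scan_heap).
 */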
static TransactionId XmaxRecent;


/* non-export function prototypes */
static void vacuum_init(void);
static void vacuum_shutdown(void);
static VRelList getrels(Name VacRelP, const char *stmttype);
static void vacuum_rel(Oid relid);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
					  VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
						VacPageList vacuum_pages, VacPageList fraged_pages,
						int nindices, Relation *Irel);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
						VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
						 double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static VacPage tid_reaped(ItemPointer itemptr, VacPageList vacpagelist);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
						   BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void get_indices(Relation relation, int *nindices, Relation **Irel);
static void close_indices(int nindices, Relation *Irel);
static IndexInfo **get_index_desc(Relation onerel, int nindices,
			   Relation *Irel);
static void *vac_bsearch(const void *key, const void *base,
						 size_t nelem, size_t size,
						 int (*compar) (const void *, const void *));
static int	vac_cmp_blk(const void *left, const void *right);
static int	vac_cmp_offno(const void *left, const void *right);
static int	vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);
static void init_rusage(VacRUsage *ru0);
static char *show_rusage(VacRUsage *ru0);


/*
 * Primary entry point for VACUUM and ANALYZE commands.
 */
void
vacuum(VacuumStmt *vacstmt)
{
	const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
	NameData	VacRel;
	Name		VacRelName;
	VRelList	vrl,
				cur;

	/*
	 * We cannot run VACUUM inside a user transaction block; if we were
	 * inside a transaction, then our commit- and
	 * start-transaction-command calls would not have the intended effect!
	 * Furthermore, the forced commit that occurs before truncating the
	 * relation's file would have the effect of committing the rest of the
	 * user's transaction too, which would certainly not be the desired
	 * behavior.
	 */
	if (IsTransactionBlock())
		elog(ERROR, "%s cannot run inside a BEGIN/END block", stmttype);

	/*
	 * Send info about dead objects to the statistics collector
	 */
	pgstat_vacuum_tabstat();

	if (vacstmt->verbose)
		MESSAGE_LEVEL = NOTICE;
	else
		MESSAGE_LEVEL = DEBUG;

	/*
	 * Create special memory context for cross-transaction storage.
	 *
	 * Since it is a child of QueryContext, it will go away eventually even
	 * if we suffer an error; there's no need for special abort cleanup
	 * logic.
	 */
	vac_context = AllocSetContextCreate(QueryContext,
										"Vacuum",
										ALLOCSET_DEFAULT_MINSIZE,
										ALLOCSET_DEFAULT_INITSIZE,
										ALLOCSET_DEFAULT_MAXSIZE);

	/* Convert vacrel, which is just a string, to a Name */
	if (vacstmt->vacrel)
	{
		namestrcpy(&VacRel, vacstmt->vacrel);
		VacRelName = &VacRel;
	}
	else
		VacRelName = NULL;

	/* Build list of relations to process (note this lives in vac_context) */
	vrl = getrels(VacRelName, stmttype);

	/*
	 * Start up the vacuum cleaner.
	 */
	vacuum_init();

	/*
	 * Process each selected relation.  We are careful to process
	 * each relation in a separate transaction in order to avoid holding
	 * too many locks at one time.
	 */
	for (cur = vrl; cur != (VRelList) NULL; cur = cur->vrl_next)
	{
		if (vacstmt->vacuum)
			vacuum_rel(cur->vrl_relid);
		/* analyze separately so locking is minimized */
		if (vacstmt->analyze)
			analyze_rel(cur->vrl_relid, vacstmt);
	}

	/* clean up */
	vacuum_shutdown();
}

/*
 *	vacuum_init(), vacuum_shutdown() -- start up and shut down the vacuum cleaner.
 *
 *		Formerly, there was code here to prevent more than one VACUUM from
 *		executing concurrently in the same database.  However, there's no
 *		good reason to prevent that, and manually removing lockfiles after
 *		a vacuum crash was a pain for dbadmins.  So, forget about lockfiles,
 *		and just rely on the exclusive lock we grab on each target table
 *		to ensure that there aren't two VACUUMs running on the same table
 *		at the same time.
 *
 *		The strangeness with committing and starting transactions in the
 *		init and shutdown routines is due to the fact that the vacuum cleaner
 *		is invoked via an SQL command, and so is already executing inside
 *		a transaction.	We need to leave ourselves in a predictable state
 *		on entry and exit to the vacuum cleaner.  We commit the transaction
 *		started in PostgresMain() inside vacuum_init(), and start one in
 *		vacuum_shutdown() to match the commit waiting for us back in
 *		PostgresMain().
 */
static void
vacuum_init(void)
{
	/* matches the StartTransaction in PostgresMain() */
	CommitTransactionCommand();
}

static void
vacuum_shutdown(void)
{
	/* on entry, we are not in a transaction */

	/*
	 * Flush the init file that relcache.c uses to save startup time. The
	 * next backend startup will rebuild the init file with up-to-date
	 * information from pg_class.  This lets the optimizer see the stats
	 * that we've collected for certain critical system indexes.  See
	 * relcache.c for more details.
	 *
	 * Ignore any failure to unlink the file, since it might not be there if
	 * no backend has been started since the last vacuum...
	 */
	unlink(RELCACHE_INIT_FILENAME);

	/* matches the CommitTransaction in PostgresMain() */
	StartTransactionCommand();

	/*
	 * Clean up working storage --- note we must do this after
	 * StartTransactionCommand, else we might be trying to delete the
	 * active context!
	 */
	MemoryContextDelete(vac_context);
	vac_context = NULL;
}

/*
 * Build a list of VRelListData nodes for each relation to be processed
 */
static VRelList
getrels(Name VacRelP, const char *stmttype)
{
	Relation	rel;
	TupleDesc	tupdesc;
	HeapScanDesc scan;
	HeapTuple	tuple;
	VRelList	vrl,
				cur;
	Datum		d;
	char	   *rname;
	char		rkind;
	bool		n;
	ScanKeyData key;

	if (VacRelP)
	{

		/*
		 * we could use the cache here, but it is clearer to use scankeys
		 * for both vacuum cases, bjm 2000/01/19
		 */
		char	   *nontemp_relname;

		/* We must re-map temp table names bjm 2000-04-06 */
		nontemp_relname = get_temp_rel_by_username(NameStr(*VacRelP));
		if (nontemp_relname == NULL)
			nontemp_relname = NameStr(*VacRelP);

		ScanKeyEntryInitialize(&key, 0x0, Anum_pg_class_relname,
							   F_NAMEEQ,
							   PointerGetDatum(nontemp_relname));
	}
	else
	{
		/* find all relations listed in pg_class */
		ScanKeyEntryInitialize(&key, 0x0, Anum_pg_class_relkind,
							   F_CHAREQ, CharGetDatum('r'));
	}

	vrl = cur = (VRelList) NULL;

	rel = heap_openr(RelationRelationName, AccessShareLock);
	tupdesc = RelationGetDescr(rel);

	scan = heap_beginscan(rel, false, SnapshotNow, 1, &key);

	while (HeapTupleIsValid(tuple = heap_getnext(scan, 0)))
	{
		d = heap_getattr(tuple, Anum_pg_class_relname, tupdesc, &n);
		rname = (char *) DatumGetName(d);

		d = heap_getattr(tuple, Anum_pg_class_relkind, tupdesc, &n);
		rkind = DatumGetChar(d);

		if (rkind != RELKIND_RELATION)
		{
			elog(NOTICE, "%s: can not process indexes, views or special system tables",
				 stmttype);
			continue;
		}

		/* Make a relation list entry for this guy */
		if (vrl == (VRelList) NULL)
			vrl = cur = (VRelList)
				MemoryContextAlloc(vac_context, sizeof(VRelListData));
		else
		{
			cur->vrl_next = (VRelList)
				MemoryContextAlloc(vac_context, sizeof(VRelListData));
			cur = cur->vrl_next;
		}

		cur->vrl_relid = tuple->t_data->t_oid;
		cur->vrl_next = (VRelList) NULL;
	}

	heap_endscan(scan);
	heap_close(rel, AccessShareLock);

	if (vrl == NULL)
		elog(NOTICE, "%s: table not found", stmttype);

	return vrl;
}

/*
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indices, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.	The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
 *
 *		At entry and exit, we are not inside a transaction.
 */
static void
vacuum_rel(Oid relid)
{
	Relation	onerel;
	LockRelId	onerelid;
	VacPageListData vacuum_pages;		/* List of pages to vacuum and/or
										 * clean indices */
	VacPageListData fraged_pages;		/* List of pages with space enough
										 * for re-using */
	Relation   *Irel;
	int32		nindices,
				i;
	VRelStats  *vacrelstats;
	bool		reindex = false;
	Oid			toast_relid;

	/* Begin a transaction for vacuuming this relation */
	StartTransactionCommand();

	/*
	 * Check for user-requested abort.	Note we want this to be inside a
	 * transaction, so xact.c doesn't issue useless NOTICE.
	 */
	CHECK_FOR_INTERRUPTS();

	/*
	 * Race condition -- if the pg_class tuple has gone away since the
	 * last time we saw it, we don't need to vacuum it.
	 */
	if (!SearchSysCacheExists(RELOID,
							  ObjectIdGetDatum(relid),
							  0, 0, 0))
	{
		CommitTransactionCommand();
		return;
	}

	/*
	 * Open the class, get an exclusive lock on it, and check permissions.
	 *
	 * We allow the user to vacuum a table if he is superuser, the table
	 * owner, or the database owner (but in the latter case, only if it's
	 * not a shared relation).  pg_ownercheck includes the superuser case.
	 *
	 * Note we choose to treat permissions failure as a NOTICE and keep
	 * trying to vacuum the rest of the DB --- is this appropriate?
	 */
	onerel = heap_open(relid, AccessExclusiveLock);

	if (! (pg_ownercheck(GetUserId(), RelationGetRelationName(onerel),
						 RELNAME) ||
		   (is_dbadmin(MyDatabaseId) && !onerel->rd_rel->relisshared)))
	{
		elog(NOTICE, "Skipping \"%s\" --- only table or database owner can VACUUM it",
			 RelationGetRelationName(onerel));
		heap_close(onerel, AccessExclusiveLock);
		CommitTransactionCommand();
		return;
	}

	/*
	 * Get a session-level exclusive lock too.	This will protect our
	 * exclusive access to the relation across multiple transactions, so
	 * that we can vacuum the relation's TOAST table (if any) secure in
	 * the knowledge that no one is diddling the parent relation.
	 *
	 * NOTE: this cannot block, even if someone else is waiting for access,
	 * because the lock manager knows that both lock requests are from the
	 * same process.
	 */
	onerelid = onerel->rd_lockInfo.lockRelId;
	LockRelationForSession(&onerelid, AccessExclusiveLock);

	/*
	 * Remember the relation's TOAST relation for later
	 */
	toast_relid = onerel->rd_rel->reltoastrelid;

	/*
	 * Set up statistics-gathering machinery.
	 */
	vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
	vacrelstats->relid = relid;
	vacrelstats->rel_pages = 0;
	vacrelstats->rel_tuples = 0;
	vacrelstats->hasindex = false;

	GetXmaxRecent(&XmaxRecent);

	/* scan it */
	reindex = false;
	vacuum_pages.num_pages = fraged_pages.num_pages = 0;
	scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);
	if (IsIgnoringSystemIndexes() &&
		IsSystemRelationName(RelationGetRelationName(onerel)))
		reindex = true;

	/* Now open indices */
	nindices = 0;
	Irel = (Relation *) NULL;
	get_indices(onerel, &nindices, &Irel);
	if (!Irel)
		reindex = false;
	else if (!RelationGetForm(onerel)->relhasindex)
		reindex = true;
	if (nindices > 0)
		vacrelstats->hasindex = true;
	else
		vacrelstats->hasindex = false;

#ifdef NOT_USED
	/*
	 * reindex in VACUUM is dangerous under WAL. ifdef out until it
	 * becomes safe.
	 */
	if (reindex)
	{
		for (i = 0; i < nindices; i++)
			index_close(Irel[i]);
		Irel = (Relation *) NULL;
		activate_indexes_of_a_table(relid, false);
	}
#endif	 /* NOT_USED */

	/* Clean/scan index relation(s) */
	if (Irel != (Relation *) NULL)
	{
		if (vacuum_pages.num_pages > 0)
		{
			for (i = 0; i < nindices; i++)
				vacuum_index(&vacuum_pages, Irel[i],
							 vacrelstats->rel_tuples, 0);
		}
		else
		{
			/* just scan indices to update statistics */
			for (i = 0; i < nindices; i++)
				scan_index(Irel[i], vacrelstats->rel_tuples);
		}
	}

	if (fraged_pages.num_pages > 0)
	{
		/* Try to shrink heap */
		repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
					nindices, Irel);
	}
	else
	{
		if (Irel != (Relation *) NULL)
			close_indices(nindices, Irel);
		if (vacuum_pages.num_pages > 0)
		{
			/* Clean pages from vacuum_pages list */
			vacuum_heap(vacrelstats, onerel, &vacuum_pages);
		}
		else
		{
			/*
			 * Flush dirty pages out to disk.  We must do this even if we
			 * didn't do anything else, because we want to ensure that all
			 * tuples have correct on-row commit status on disk (see
			 * bufmgr.c's comments for FlushRelationBuffers()).
			 */
			i = FlushRelationBuffers(onerel, vacrelstats->rel_pages);
			if (i < 0)
				elog(ERROR, "VACUUM (vacuum_rel): FlushRelationBuffers returned %d",
					 i);
		}
	}
#ifdef NOT_USED
	if (reindex)
		activate_indexes_of_a_table(relid, true);
#endif	 /* NOT_USED */

	/* update shared free space map with final free space info */
	vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

	/* all done with this class, but hold lock until commit */
	heap_close(onerel, NoLock);

	/* update statistics in pg_class */
	vac_update_relstats(vacrelstats->relid, vacrelstats->rel_pages,
						vacrelstats->rel_tuples, vacrelstats->hasindex);

	/*
	 * Complete the transaction and free all temporary memory used.
	 */
	CommitTransactionCommand();

	/*
	 * If the relation has a secondary toast one, vacuum that too while we
	 * still hold the session lock on the master table. We don't need to
	 * propagate "analyze" to it, because the toaster always uses
	 * hardcoded index access and statistics are totally unimportant for
	 * toast relations
	 */
	if (toast_relid != InvalidOid)
		vacuum_rel(toast_relid);

	/*
	 * Now release the session-level lock on the master table.
	 */
	UnlockRelationForSession(&onerelid, AccessExclusiveLock);
}

/*
 *	scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, constructs vacuum_pages (list
 *		of pages we need to compact free space on and/or clean indexes of
 *		deleted tuples), constructs fraged_pages (list of pages with free
 *		space that tuples could be moved into), and calculates statistics
 *		on the number of live tuples in a heap.
 */
static void
scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages)
{
	BlockNumber nblocks,
				blkno;
	ItemId		itemid;
	Buffer		buf;
	HeapTupleData tuple;
	OffsetNumber offnum,
				maxoff;
	bool		pgchanged,
				tupgone,
				notup;
	char	   *relname;
	VacPage		vacpage,
				vacpagecopy;
	BlockNumber	empty_pages,
				new_pages,
				changed_pages,
				empty_end_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused,
				ncrash;
	double		free_size,
				usable_free_size;
	Size		min_tlen = MaxTupleSize;
	Size		max_tlen = 0;
	int			i;
	bool		do_shrinking = true;
	VTupleLink	vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
	int			num_vtlinks = 0;
	int			free_vtlinks = 100;
	VacRUsage	ru0;

	init_rusage(&ru0);

	relname = RelationGetRelationName(onerel);
	elog(MESSAGE_LEVEL, "--Relation %s--", relname);

	empty_pages = new_pages = changed_pages = empty_end_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = ncrash = 0;
	free_size = 0;

	nblocks = RelationGetNumberOfBlocks(onerel);

	/*
	 * We initially create each VacPage item in a maximal-sized workspace,
	 * then copy the workspace into a just-large-enough copy.
	 */
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Page		page,
					tempPage = NULL;
		bool		do_reap,
					do_frag;

		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		vacpage->blkno = blkno;
		vacpage->offsets_used = 0;
		vacpage->offsets_free = 0;

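		/*
		 * A page's free space is the gap between pd_lower (end of the
		 * line pointer array) and pd_upper (start of tuple data).
		 */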
		if (PageIsNew(page))
		{
			elog(NOTICE, "Rel %s: Uninitialized page %u - fixing",
				 relname, blkno);
			PageInit(page, BufferGetPageSize(buf), 0);
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += (vacpage->free - sizeof(ItemIdData));
			new_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			WriteBuffer(buf);
			continue;
		}

		if (PageIsEmpty(page))
		{
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += (vacpage->free - sizeof(ItemIdData));
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			ReleaseBuffer(buf);
			continue;
		}

		pgchanged = false;
		notup = true;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			itemid = PageGetItemId(page, offnum);

			/*
			 * Collect un-used items too - it's possible to have indices
			 * pointing here after a crash.
			 */
			if (!ItemIdIsUsed(itemid))
			{
				vacpage->offsets[vacpage->offsets_free++] = offnum;
				nunused += 1;
				continue;
			}

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
			tupgone = false;

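			/*
			 * Resolve the tuple's visibility: settle xmin (the inserting
			 * transaction) first, then xmax (the deleting transaction),
			 * recording what we learn in the t_infomask hint bits.
			 */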
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (tuple.t_data->t_infomask & HEAP_XMIN_INVALID)
					tupgone = true;
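				/*
				 * HEAP_MOVED_OFF/HEAP_MOVED_IN mark tuples shifted by an
				 * earlier (possibly crashed) VACUUM; that vacuum stored
				 * its XID in t_cmin, so the XID's commit status tells us
				 * whether the move took effect.
				 */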
				else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (TransactionIdDidCommit((TransactionId)
											   tuple.t_data->t_cmin))
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
						pgchanged = true;
						tupgone = true;
					}
					else
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
						pgchanged = true;
					}
				}
				else if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
				{
					if (!TransactionIdDidCommit((TransactionId)
												tuple.t_data->t_cmin))
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
						pgchanged = true;
						tupgone = true;
					}
					else
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
						pgchanged = true;
					}
				}
				else
				{
					if (TransactionIdDidAbort(tuple.t_data->t_xmin))
						tupgone = true;
					else if (TransactionIdDidCommit(tuple.t_data->t_xmin))
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
						pgchanged = true;
					}
					else if (!TransactionIdIsInProgress(tuple.t_data->t_xmin))
					{
						/*
						 * Not Aborted, Not Committed, Not in Progress -
						 * so it's from a crashed process. - vadim 11/26/96
						 */
						ncrash += 1;
						tupgone = true;
					}
					else
					{
						elog(NOTICE, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
						   relname, blkno, offnum, tuple.t_data->t_xmin);
						do_shrinking = false;
					}
				}
			}

			/*
			 * here we are concerned about tuples with xmin committed and
			 * xmax unknown or committed
			 */
			if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED &&
				!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID))
			{
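				/*
				 * xmax is set and not known invalid.  A pure row lock
				 * (HEAP_MARKED_FOR_UPDATE) does not kill the tuple - we
				 * just clear the stale xmax hint bits; a committed real
				 * xmax means the tuple is dead.
				 */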
				if (tuple.t_data->t_infomask & HEAP_XMAX_COMMITTED)
				{
					if (tuple.t_data->t_infomask & HEAP_MARKED_FOR_UPDATE)
					{
						tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
						tuple.t_data->t_infomask &=
							~(HEAP_XMAX_COMMITTED | HEAP_MARKED_FOR_UPDATE);
						pgchanged = true;
					}
					else
						tupgone = true;
				}
				else if (TransactionIdDidAbort(tuple.t_data->t_xmax))
				{
					tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
					pgchanged = true;
				}
				else if (TransactionIdDidCommit(tuple.t_data->t_xmax))
				{
					if (tuple.t_data->t_infomask & HEAP_MARKED_FOR_UPDATE)
					{
						tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
						tuple.t_data->t_infomask &=
							~(HEAP_XMAX_COMMITTED | HEAP_MARKED_FOR_UPDATE);
						pgchanged = true;
					}
					else
						tupgone = true;
				}
				else if (!TransactionIdIsInProgress(tuple.t_data->t_xmax))
				{
					/*
					 * Not Aborted, Not Committed, Not in Progress - so it's
					 * from a crashed process. - vadim 06/02/97
					 */
					tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
					tuple.t_data->t_infomask &=
						~(HEAP_XMAX_COMMITTED | HEAP_MARKED_FOR_UPDATE);
					pgchanged = true;
				}
				else
				{
					elog(NOTICE, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
						 relname, blkno, offnum, tuple.t_data->t_xmax);
					do_shrinking = false;
				}

				/*
				 * If tuple is recently deleted then we must not remove it
				 * from relation.
				 */
				if (tupgone &&
					(tuple.t_data->t_infomask & HEAP_XMIN_INVALID) == 0 &&
					tuple.t_data->t_xmax >= XmaxRecent)
				{
					tupgone = false;
					nkeep += 1;
					if (!(tuple.t_data->t_infomask & HEAP_XMAX_COMMITTED))
					{
						tuple.t_data->t_infomask |= HEAP_XMAX_COMMITTED;
						pgchanged = true;
					}

					/*
					 * If we do shrinking and this tuple is updated one
					 * then remember it to construct updated tuple
					 * dependencies.
					 */
					if (do_shrinking && !(ItemPointerEquals(&(tuple.t_self),
											   &(tuple.t_data->t_ctid))))
					{
						if (free_vtlinks == 0)
						{
							free_vtlinks = 1000;
							vtlinks = (VTupleLink) repalloc(vtlinks,
										   (free_vtlinks + num_vtlinks) *
												 sizeof(VTupleLinkData));
						}
						vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
						vtlinks[num_vtlinks].this_tid = tuple.t_self;
						free_vtlinks--;
						num_vtlinks++;
					}
				}
			}

			/*
			 * Other checks...
			 */
			if (!OidIsValid(tuple.t_data->t_oid))
			{
				elog(NOTICE, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
					 relname, blkno, offnum, tupgone);
			}

			if (tupgone)
			{
				ItemId		lpp;

				/*
				 * Here we are building a temporary copy of the page with
				 * dead tuples removed.  Below we will apply
				 * PageRepairFragmentation to the copy, so that we can
				 * determine how much space will be available after
				 * removal of dead tuples.	But note we are NOT changing
				 * the real page yet...
				 */
				if (tempPage == (Page) NULL)
				{
					Size		pageSize;

					pageSize = PageGetPageSize(page);
					tempPage = (Page) palloc(pageSize);
					memcpy(tempPage, page, pageSize);
				}

				/* mark it unused on the temp page */
				lpp = &(((PageHeader) tempPage)->pd_linp[offnum - 1]);
				lpp->lp_flags &= ~LP_USED;

				vacpage->offsets[vacpage->offsets_free++] = offnum;
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				notup = false;
				if (tuple.t_len < min_tlen)
					min_tlen = tuple.t_len;
				if (tuple.t_len > max_tlen)
					max_tlen = tuple.t_len;
			}
		}

		if (tempPage != (Page) NULL)
		{
			/* Some tuples are removable; figure free space after removal */
953
			PageRepairFragmentation(tempPage, NULL);
Bruce Momjian's avatar
955
			pfree(tempPage);
956
			do_reap = true;
957
		}
958 959 960
		else
		{
			/* Just use current available space */
Bruce Momjian's avatar
962 963
			/* Need to reap the page if it has ~LP_USED line pointers */
			do_reap = (vacpage->offsets_free > 0);
964
		}
965

966 967 968 969 970 971 972 973 974 975
		free_size += vacpage->free;
		/*
		 * Add the page to fraged_pages if it has a useful amount of free
		 * space.  "Useful" means enough for a minimal-sized tuple.
		 * But we don't know that accurately near the start of the relation,
		 * so add pages unconditionally if they have >= BLCKSZ/10 free space.
		 */
		do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ/10);

		if (do_reap || do_frag)
976
		{
977 978 979 980 981
			vacpagecopy = copy_vac_page(vacpage);
			if (do_reap)
				vpage_insert(vacuum_pages, vacpagecopy);
			if (do_frag)
				vpage_insert(fraged_pages, vacpagecopy);
982 983
		}

984
		if (notup)
Bruce Momjian's avatar
986
		else
Bruce Momjian's avatar
988 989 990 991 992 993 994 995

		if (pgchanged)
		{
			WriteBuffer(buf);
			changed_pages++;
		}
		else
			ReleaseBuffer(buf);
996 997
	}

Bruce Momjian's avatar
999 1000

	/* save stats in the rel list for use later */
1001 1002
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->rel_pages = nblocks;
1003
	if (num_tuples == 0)
1004 1005 1006 1007
		min_tlen = max_tlen = 0;
	vacrelstats->min_tlen = min_tlen;
	vacrelstats->max_tlen = max_tlen;

Bruce Momjian's avatar
	fraged_pages->empty_end_pages = empty_end_pages;
1010 1011

	/*
1012 1013 1014
	 * Clear the fraged_pages list if we found we couldn't shrink.
	 * Else, remove any "empty" end-pages from the list, and compute
	 * usable free space = free space in remaining pages.
1015
	 */
1016
	if (do_shrinking)
1017
	{
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027
		Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
		fraged_pages->num_pages -= empty_end_pages;
		usable_free_size = 0;
		for (i = 0; i < fraged_pages->num_pages; i++)
			usable_free_size += fraged_pages->pagedesc[i]->free;
	}
	else
	{
		fraged_pages->num_pages = 0;
		usable_free_size = 0;
1028
	}
1029

1030 1031
	if (usable_free_size > 0 && num_vtlinks > 0)
	{
Bruce Momjian's avatar
Bruce Momjian's avatar
1034 1035 1036 1037 1038 1039 1040 1041 1042 1043
		vacrelstats->vtlinks = vtlinks;
		vacrelstats->num_vtlinks = num_vtlinks;
	}
	else
	{
		vacrelstats->vtlinks = NULL;
		vacrelstats->num_vtlinks = 0;
		pfree(vtlinks);
	}

	elog(MESSAGE_LEVEL, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; \
Tup %.0f: Vac %.0f, Keep/VTL %.0f/%u, Crash %.0f, UnUsed %.0f, MinLen %lu, MaxLen %lu; \
Re-using: Free/Avail. Space %.0f/%.0f; EndEmpty/Avail. Pages %u/%u. %s",
		 nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
		 new_pages, num_tuples, tups_vacuumed,
		 nkeep, vacrelstats->num_vtlinks, ncrash,
		 nunused, (unsigned long) min_tlen, (unsigned long) max_tlen,
		 free_size, usable_free_size,
		 empty_end_pages, fraged_pages->num_pages,
		 show_rusage(&ru0));

}


/*
 *	repair_frag() -- try to repair relation's fragmentation
 *
 *		This routine marks dead tuples as unused and tries to re-use dead
 *		space by moving tuples (and inserting indices if needed). It
 *		constructs Nvacpagelist, a list of freed pages (from which tuples
 *		were moved), and cleans their indices after committing the current
 *		transaction (in a hackish manner - without losing locks and without
 *		freeing memory!). It truncates the relation if some end-blocks have
 *		gone away.
 */
static void
repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindices, Relation *Irel)
{
	TransactionId myXID;
	CommandId	myCID;
	Buffer		buf,
				cur_buffer;
	BlockNumber	nblocks,
				blkno;
	BlockNumber	last_move_dest_block = 0,
				last_vacuum_block;
	Page		page,
				ToPage = NULL;
	OffsetNumber offnum,
				maxoff,
				newoff,
				max_offset;
	ItemId		itemid,
				newitemid;
	HeapTupleData tuple,
				newtup;
	TupleDesc	tupdesc;
	IndexInfo **indexInfo = NULL;
	Datum		idatum[INDEX_MAX_KEYS];
	char		inulls[INDEX_MAX_KEYS];
	InsertIndexResult iresult;
	VacPageListData Nvacpagelist;
	VacPage		cur_page = NULL,
				last_vacuum_page,
				vacpage,
			   *curpage;
	int			cur_item = 0;
	int			i;
	Size		tuple_len;
	int			num_moved,
				num_fraged_pages,
				vacuumed_pages;
	int			checked_moved,
				num_tuples,
				keep_tuples = 0;
	bool		isempty,
				dowrite,
				chain_tuple_moved;
	VacRUsage	ru0;

	init_rusage(&ru0);

	myXID = GetCurrentTransactionId();
	myCID = GetCurrentCommandId();

	tupdesc = RelationGetDescr(onerel);

	if (Irel != (Relation *) NULL)		/* preparation for index' inserts */
		indexInfo = get_index_desc(onerel, nindices, Irel);

	Nvacpagelist.num_pages = 0;
	num_fraged_pages = fraged_pages->num_pages;
	Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
	vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
	if (vacuumed_pages > 0)
	{
		/* get last reaped page from vacuum_pages */
		last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
		last_vacuum_block = last_vacuum_page->blkno;
	}
	else
	{
		last_vacuum_page = NULL;
		last_vacuum_block = InvalidBlockNumber;
	}
	cur_buffer = InvalidBuffer;
	num_moved = 0;

	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	vacpage->offsets_used = vacpage->offsets_free = 0;

	/*
	 * Scan pages backwards from the last nonempty page, trying to move
	 * tuples down to lower pages.	Quit when we reach a page that we have
	 * moved any tuples onto, or the first page if we haven't moved anything,
	 * or when we find a page we cannot completely empty (this last condition
	 * is handled by "break" statements within the loop).
	 *
	 * NB: this code depends on the vacuum_pages and fraged_pages lists being
	 * in order by blkno.
	 */
	nblocks = vacrelstats->rel_pages;
	for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
		 blkno > last_move_dest_block;
		 blkno--)
	{
		/*
		 * Forget fraged_pages pages at or after this one; they're no longer
		 * useful as move targets, since we only want to move down.  Note
		 * that since we stop the outer loop at last_move_dest_block, pages
		 * removed here cannot have had anything moved onto them already.
		 *
		 * Also note that we don't change the stored fraged_pages list,
		 * only our local variable num_fraged_pages; so the forgotten pages
		 * are still available to be loaded into the free space map later.
		 */
		while (num_fraged_pages > 0 &&
			   fraged_pages->pagedesc[num_fraged_pages-1]->blkno >= blkno)
		{
			Assert(fraged_pages->pagedesc[num_fraged_pages-1]->offsets_used == 0);
			--num_fraged_pages;
		}

		/*
		 * Process this page of relation.
		 */
		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		vacpage->offsets_free = 0;

		isempty = PageIsEmpty(page);

		dowrite = false;

		/* Is the page in the vacuum_pages list? */
		if (blkno == last_vacuum_block)
		{
			if (last_vacuum_page->offsets_free > 0)
			{
				/* there are dead tuples on this page - clean them */
				Assert(!isempty);
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				vacuum_page(onerel, buf, last_vacuum_page);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				dowrite = true;
			}
			else
				Assert(isempty);
			--vacuumed_pages;
			if (vacuumed_pages > 0)
			{
				/* get prev reaped page from vacuum_pages */
				last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
				last_vacuum_block = last_vacuum_page->blkno;
			}
			else
			{
				last_vacuum_page = NULL;
				last_vacuum_block = InvalidBlockNumber;
			}
			if (isempty)
			{
				ReleaseBuffer(buf);
				continue;
			}
		}
		else
			Assert(!isempty);

		chain_tuple_moved = false;		/* no chain-tuple has been moved
										 * off this page, yet */
		vacpage->blkno = blkno;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple_len = tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if ((TransactionId) tuple.t_data->t_cmin != myXID)
					elog(ERROR, "Invalid XID in t_cmin");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected");

				/*
				 * If this (chain) tuple has already been moved by me, then I
				 * have to check whether it is in vacpage or not - i.e.
				 * whether it was moved while cleaning this page or some
				 * previous one.
				 */
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (keep_tuples == 0)
						continue;
					if (chain_tuple_moved)		/* some chains were moved
												 * while */
					{			/* cleaning this page */
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
						{
							if (vacpage->offsets[i] == offnum)
								break;
						}
						if (i >= vacpage->offsets_free) /* not found */
						{
							vacpage->offsets[vacpage->offsets_free++] = offnum;
							keep_tuples--;
						}
					}
					else
					{
						vacpage->offsets[vacpage->offsets_free++] = offnum;
						keep_tuples--;
					}
					continue;
				}
				elog(ERROR, "HEAP_MOVED_OFF was expected");
			}

			/*
			 * If this tuple is in a chain of tuples created in updates
			 * by "recent" transactions then we have to move the whole
			 * chain of tuples to other places.
			 */
			if ((tuple.t_data->t_infomask & HEAP_UPDATED &&
				 tuple.t_data->t_xmin >= XmaxRecent) ||
				(!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID) &&
				 !(ItemPointerEquals(&(tuple.t_self), &(tuple.t_data->t_ctid)))))
			{
				Buffer		Cbuf = buf;
				Page		Cpage;
				ItemId		Citemid;
				ItemPointerData Ctid;
				HeapTupleData tp = tuple;
				Size		tlen = tuple_len;
				VTupleMove	vtmove = (VTupleMove)
				palloc(100 * sizeof(VTupleMoveData));
				int			num_vtmove = 0;
				int			free_vtmove = 100;
				VacPage		to_vacpage = NULL;
				int			to_item = 0;
				bool		freeCbuf = false;
				int			ti;

				if (vacrelstats->vtlinks == NULL)
					elog(ERROR, "No one parent tuple was found");
				if (cur_buffer != InvalidBuffer)
				{
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;
				}

				/*
				 * If this tuple is at the beginning or in the middle of the
				 * chain then we have to move to the end of the chain first.
				 */
				while (!(tp.t_data->t_infomask & HEAP_XMAX_INVALID) &&
				!(ItemPointerEquals(&(tp.t_self), &(tp.t_data->t_ctid))))
				{
					Ctid = tp.t_data->t_ctid;
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					freeCbuf = true;
					Cbuf = ReadBuffer(onerel,
									  ItemPointerGetBlockNumber(&Ctid));
					Cpage = BufferGetPage(Cbuf);
					Citemid = PageGetItemId(Cpage,
									  ItemPointerGetOffsetNumber(&Ctid));
					if (!ItemIdIsUsed(Citemid))
					{

						/*
						 * This means that in the middle of the chain
						 * there was a tuple updated by an older (than
						 * XmaxRecent) xaction, and this tuple is already
						 * deleted by me. Actually, the upper part of the
						 * chain should be removed, and it seems that this
						 * should be handled in scan_heap(), but it's not
						 * implemented at the moment, so we just stop
						 * shrinking here.
						 */
						ReleaseBuffer(Cbuf);
						pfree(vtmove);
						vtmove = NULL;
						elog(NOTICE, "Child itemid in update-chain marked as unused - can't continue repair_frag");
						break;
					}
					tp.t_datamcxt = NULL;
					tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tp.t_self = Ctid;
					tlen = tp.t_len = ItemIdGetLength(Citemid);
				}
				if (vtmove == NULL)
					break;
				/* first, can the whole chain be moved? */
				for (;;)
				{
					if (to_vacpage == NULL ||
						!enough_space(to_vacpage, tlen))
					{
						for (i = 0; i < num_fraged_pages; i++)
						{
							if (enough_space(fraged_pages->pagedesc[i], tlen))
								break;
						}

						if (i == num_fraged_pages)
						{
							/* can't move item anywhere */
							for (i = 0; i < num_vtmove; i++)
							{
								Assert(vtmove[i].vacpage->offsets_used > 0);
								(vtmove[i].vacpage->offsets_used)--;
							}
							num_vtmove = 0;
							break;
						}
						to_item = i;
						to_vacpage = fraged_pages->pagedesc[to_item];
					}
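					/*
					 * Charge the destination page for this tuple's space;
					 * once its freed line pointers are all spoken for,
					 * each further move needs a new ItemId as well.
					 */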
					to_vacpage->free -= MAXALIGN(tlen);
					if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
						to_vacpage->free -= MAXALIGN(sizeof(ItemIdData));
					(to_vacpage->offsets_used)++;
					if (free_vtmove == 0)
					{
						free_vtmove = 1000;
						vtmove = (VTupleMove) repalloc(vtmove,
											 (free_vtmove + num_vtmove) *
												 sizeof(VTupleMoveData));
					}
					vtmove[num_vtmove].tid = tp.t_self;
					vtmove[num_vtmove].vacpage = to_vacpage;
					if (to_vacpage->offsets_used == 1)
						vtmove[num_vtmove].cleanVpd = true;
					else
						vtmove[num_vtmove].cleanVpd = false;
					free_vtmove--;
					num_vtmove++;

					/* All done ? */
					if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
						tp.t_data->t_xmin < XmaxRecent)
						break;

					/* Well, try to find tuple with old row version */
					for (;;)
					{
						Buffer		Pbuf;
						Page		Ppage;
						ItemId		Pitemid;
						HeapTupleData Ptp;
						VTupleLinkData vtld,
								   *vtlp;

						vtld.new_tid = tp.t_self;
						vtlp = (VTupleLink)
							vac_bsearch((void *) &vtld,
										(void *) (vacrelstats->vtlinks),
										vacrelstats->num_vtlinks,
										sizeof(VTupleLinkData),
										vac_cmp_vtlinks);
						if (vtlp == NULL)
							elog(ERROR, "Parent tuple was not found");
						tp.t_self = vtlp->this_tid;
						Pbuf = ReadBuffer(onerel,
								ItemPointerGetBlockNumber(&(tp.t_self)));
						Ppage = BufferGetPage(Pbuf);
						Pitemid = PageGetItemId(Ppage,
							   ItemPointerGetOffsetNumber(&(tp.t_self)));
						if (!ItemIdIsUsed(Pitemid))
							elog(ERROR, "Parent itemid marked as unused");
						Ptp.t_datamcxt = NULL;
						Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
						Assert(ItemPointerEquals(&(vtld.new_tid),
												 &(Ptp.t_data->t_ctid)));

						/*
						 * Read above about cases when
						 * !ItemIdIsUsed(Citemid) (child item is
						 * removed)... Since at the moment we don't
						 * remove the unusable part of an update-chain,
						 * it's possible to get a too-old parent row
						 * here. As in the case which caused this
						 * problem, we stop shrinking here. I could try
						 * to find the real parent row, but I don't want
						 * to, because the real solution will be
						 * implemented anyway, later, and we are too
						 * close to the 6.5 release. - vadim 06/11/99
						 */
						if (Ptp.t_data->t_xmax != tp.t_data->t_xmin)
						{
							if (freeCbuf)
								ReleaseBuffer(Cbuf);
							freeCbuf = false;
							ReleaseBuffer(Pbuf);
							for (i = 0; i < num_vtmove; i++)
							{
								Assert(vtmove[i].vacpage->offsets_used > 0);
								(vtmove[i].vacpage->offsets_used)--;
							}
							num_vtmove = 0;
							elog(NOTICE, "Too old parent tuple found - can't continue repair_frag");
							break;
						}
#ifdef NOT_USED					/* I'm not sure that this will work
								 * properly... */

						/*
						 * If this tuple is an updated version of a row and
						 * it was created by the same transaction, then no
						 * one is interested in this tuple - mark it as
						 * removed.
						 */
						if (Ptp.t_data->t_infomask & HEAP_UPDATED &&
							Ptp.t_data->t_xmin == Ptp.t_data->t_xmax)
						{
							TransactionIdStore(myXID,
								(TransactionId *) &(Ptp.t_data->t_cmin));
							Ptp.t_data->t_infomask &=
								~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
							Ptp.t_data->t_infomask |= HEAP_MOVED_OFF;
							WriteBuffer(Pbuf);
							continue;
						}
#endif
						tp.t_datamcxt = Ptp.t_datamcxt;
						tp.t_data = Ptp.t_data;
						tlen = tp.t_len = ItemIdGetLength(Pitemid);
						if (freeCbuf)
							ReleaseBuffer(Cbuf);
						Cbuf = Pbuf;
						freeCbuf = true;
						break;
					}
					if (num_vtmove == 0)
						break;
				}
				if (freeCbuf)
					ReleaseBuffer(Cbuf);
				if (num_vtmove == 0)	/* chain can't be moved */
				{
					pfree(vtmove);
					break;
				}
				ItemPointerSetInvalid(&Ctid);
				for (ti = 0; ti < num_vtmove; ti++)
				{
					VacPage		destvacpage = vtmove[ti].vacpage;
1513

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1514
					/* Get page to move from */
1515
					tuple.t_self = vtmove[ti].tid;
Bruce Momjian's avatar
Bruce Momjian committed
1516 1517
					Cbuf = ReadBuffer(onerel,
							 ItemPointerGetBlockNumber(&(tuple.t_self)));
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1518 1519 1520 1521 1522 1523 1524 1525 1526

					/* Get page to move to */
					cur_buffer = ReadBuffer(onerel, destvacpage->blkno);

					LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);

					ToPage = BufferGetPage(cur_buffer);
1527
					Cpage = BufferGetPage(Cbuf);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1528

Bruce Momjian's avatar
Bruce Momjian committed
1529
					Citemid = PageGetItemId(Cpage,
1530
							ItemPointerGetOffsetNumber(&(tuple.t_self)));
1531
					tuple.t_datamcxt = NULL;
1532 1533
					tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
1534 1535 1536 1537 1538 1539 1540 1541 1542

					/*
					 * make a copy of the source tuple, and then mark the
					 * source tuple MOVED_OFF.
					 */
					heap_copytuple_with_tuple(&tuple, &newtup);

					RelationInvalidateHeapTuple(onerel, &tuple);

1543 1544 1545
					/* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
					START_CRIT_SECTION();

1546 1547 1548 1549 1550
					TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
					tuple.t_data->t_infomask &=
						~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
					tuple.t_data->t_infomask |= HEAP_MOVED_OFF;

1551 1552 1553
					/*
					 * If this page was not used before - clean it.
					 *
1554 1555
					 * NOTE: a nasty bug used to lurk here.  It is possible
					 * for the source and destination pages to be the same
1556 1557 1558 1559 1560 1561 1562
					 * (since this tuple-chain member can be on a page
					 * lower than the one we're currently processing in
					 * the outer loop).  If that's true, then after
					 * vacuum_page() the source tuple will have been
					 * moved, and tuple.t_data will be pointing at
					 * garbage.  Therefore we must do everything that uses
					 * tuple.t_data BEFORE this step!!
1563
					 *
1564
					 * This path is different from the other callers of
1565 1566
					 * vacuum_page, because we have already incremented
					 * the vacpage's offsets_used field to account for the
1567
					 * tuple(s) we expect to move onto the page. Therefore
1568 1569 1570 1571
					 * vacuum_page's check for offsets_used == 0 is wrong.
					 * But since that's a good debugging check for all
					 * other callers, we work around it here rather than
					 * remove it.
1572
					 */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1573
					if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
1574
					{
Bruce Momjian's avatar
Bruce Momjian committed
1575
						int			sv_offsets_used = destvacpage->offsets_used;
1576

Bruce Momjian's avatar
Bruce Momjian committed
1577
						destvacpage->offsets_used = 0;
1578
						vacuum_page(onerel, cur_buffer, destvacpage);
Bruce Momjian's avatar
Bruce Momjian committed
1579
						destvacpage->offsets_used = sv_offsets_used;
1580
					}
1581 1582 1583 1584 1585

					/*
					 * Update the state of the copied tuple, and store it
					 * on the destination page.
					 */
Bruce Momjian's avatar
Bruce Momjian committed
1586 1587 1588
					TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
					newtup.t_data->t_infomask &=
						~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
1589 1590
					newtup.t_data->t_infomask |= HEAP_MOVED_IN;
					newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
Bruce Momjian's avatar
Bruce Momjian committed
1591
										 InvalidOffsetNumber, LP_USED);
1592 1593
					if (newoff == InvalidOffsetNumber)
					{
1594
						elog(STOP, "moving chain: failed to add item with len = %lu to page %u",
1595
						  (unsigned long) tuple_len, destvacpage->blkno);
1596 1597 1598
					}
					newitemid = PageGetItemId(ToPage, newoff);
					pfree(newtup.t_data);
1599
					newtup.t_datamcxt = NULL;
1600
					newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
Bruce Momjian's avatar
Bruce Momjian committed
1601
					ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1602 1603

					{
1604 1605 1606
						XLogRecPtr	recptr =
						log_heap_move(onerel, Cbuf, tuple.t_self,
									  cur_buffer, &newtup);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1607 1608 1609 1610 1611 1612 1613 1614 1615

						if (Cbuf != cur_buffer)
						{
							PageSetLSN(Cpage, recptr);
							PageSetSUI(Cpage, ThisStartUpID);
						}
						PageSetLSN(ToPage, recptr);
						PageSetSUI(ToPage, ThisStartUpID);
					}
1616
					END_CRIT_SECTION();
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1617

1618
					if (destvacpage->blkno > last_move_dest_block)
Bruce Momjian's avatar
Bruce Momjian committed
1619
						last_move_dest_block = destvacpage->blkno;
Bruce Momjian's avatar
Bruce Momjian committed
1620

1621
					/*
1622
					 * Set new tuple's t_ctid pointing to itself for last
1623 1624
					 * tuple in chain, and to next tuple in chain
					 * otherwise.
1625 1626 1627 1628 1629 1630 1631 1632
					 */
					if (!ItemPointerIsValid(&Ctid))
						newtup.t_data->t_ctid = newtup.t_self;
					else
						newtup.t_data->t_ctid = Ctid;
					Ctid = newtup.t_self;

					num_moved++;
Bruce Momjian's avatar
Bruce Momjian committed
1633

1634 1635 1636 1637
					/*
					 * Remember that we moved tuple from the current page
					 * (corresponding index tuple will be cleaned).
					 */
1638
					if (Cbuf == buf)
Bruce Momjian's avatar
Bruce Momjian committed
1639
						vacpage->offsets[vacpage->offsets_free++] =
Bruce Momjian's avatar
Bruce Momjian committed
1640
							ItemPointerGetOffsetNumber(&(tuple.t_self));
1641 1642
					else
						keep_tuples++;
1643

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1644 1645 1646 1647
					LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);

1648 1649
					if (Irel != (Relation *) NULL)
					{
1650

1651 1652
						/*
						 * XXX using CurrentMemoryContext here means
1653 1654
						 * intra-vacuum memory leak for functional
						 * indexes. Should fix someday.
1655 1656
						 *
						 * XXX This code fails to handle partial indexes!
1657 1658
						 * Probably should change it to use
						 * ExecOpenIndices.
1659 1660
						 */
						for (i = 0; i < nindices; i++)
1661
						{
1662
							FormIndexDatum(indexInfo[i],
Bruce Momjian's avatar
Bruce Momjian committed
1663 1664
										   &newtup,
										   tupdesc,
1665
										   CurrentMemoryContext,
Bruce Momjian's avatar
Bruce Momjian committed
1666
										   idatum,
1667
										   inulls);
1668 1669 1670 1671 1672 1673 1674 1675 1676 1677
							iresult = index_insert(Irel[i],
												   idatum,
												   inulls,
												   &newtup.t_self,
												   onerel);
							if (iresult)
								pfree(iresult);
						}
					}
					WriteBuffer(cur_buffer);
1678
					WriteBuffer(Cbuf);
1679 1680 1681
				}
				cur_buffer = InvalidBuffer;
				pfree(vtmove);
1682
				chain_tuple_moved = true;
1683 1684 1685
				continue;
			}

1686
			/* try to find new page for this tuple */
Bruce Momjian's avatar
Bruce Momjian committed
1687
			if (cur_buffer == InvalidBuffer ||
Bruce Momjian's avatar
Bruce Momjian committed
1688
				!enough_space(cur_page, tuple_len))
1689
			{
Bruce Momjian's avatar
Bruce Momjian committed
1690
				if (cur_buffer != InvalidBuffer)
1691
				{
Bruce Momjian's avatar
Bruce Momjian committed
1692 1693
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;
1694
				}
Bruce Momjian's avatar
Bruce Momjian committed
1695
				for (i = 0; i < num_fraged_pages; i++)
1696
				{
Bruce Momjian's avatar
Bruce Momjian committed
1697
					if (enough_space(fraged_pages->pagedesc[i], tuple_len))
1698 1699
						break;
				}
Bruce Momjian's avatar
Bruce Momjian committed
1700
				if (i == num_fraged_pages)
1701
					break;		/* can't move item anywhere */
Bruce Momjian's avatar
Bruce Momjian committed
1702
				cur_item = i;
Bruce Momjian's avatar
Bruce Momjian committed
1703 1704
				cur_page = fraged_pages->pagedesc[cur_item];
				cur_buffer = ReadBuffer(onerel, cur_page->blkno);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1705
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
Bruce Momjian's avatar
Bruce Momjian committed
1706
				ToPage = BufferGetPage(cur_buffer);
1707
				/* if this page was not used before - clean it */
Bruce Momjian's avatar
Bruce Momjian committed
1708
				if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
1709
					vacuum_page(onerel, cur_buffer, cur_page);
1710
			}
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1711 1712 1713 1714
			else
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);

			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1715 1716

			/* copy tuple */
1717
			heap_copytuple_with_tuple(&tuple, &newtup);
1718

1719
			RelationInvalidateHeapTuple(onerel, &tuple);
1720

1721 1722 1723
			/* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
			START_CRIT_SECTION();

Bruce Momjian's avatar
Bruce Momjian committed
1724 1725 1726
			/*
			 * Mark new tuple as moved_in by vacuum and store vacuum XID
			 * in t_cmin !!!
1727
			 */
Bruce Momjian's avatar
Bruce Momjian committed
1728 1729 1730
			TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
			newtup.t_data->t_infomask &=
				~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
1731
			newtup.t_data->t_infomask |= HEAP_MOVED_IN;
1732 1733

			/* add tuple to the page */
1734
			newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
1735 1736 1737
								 InvalidOffsetNumber, LP_USED);
			if (newoff == InvalidOffsetNumber)
			{
1738 1739 1740
				elog(STOP, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
					 (unsigned long) tuple_len,
					 cur_page->blkno, (unsigned long) cur_page->free,
1741
					 cur_page->offsets_used, cur_page->offsets_free);
1742 1743
			}
			newitemid = PageGetItemId(ToPage, newoff);
1744
			pfree(newtup.t_data);
1745
			newtup.t_datamcxt = NULL;
1746
			newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
Bruce Momjian's avatar
Bruce Momjian committed
1747
			ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
1748
			newtup.t_self = newtup.t_data->t_ctid;
1749

Bruce Momjian's avatar
Bruce Momjian committed
1750 1751 1752
			/*
			 * Mark old tuple as moved_off by vacuum and store vacuum XID
			 * in t_cmin !!!
1753
			 */
Bruce Momjian's avatar
Bruce Momjian committed
1754 1755 1756
			TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
			tuple.t_data->t_infomask &=
				~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1757
			tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
1758

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1759
			{
1760 1761 1762
				XLogRecPtr	recptr =
				log_heap_move(onerel, buf, tuple.t_self,
							  cur_buffer, &newtup);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1763 1764 1765 1766 1767 1768

				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
				PageSetLSN(ToPage, recptr);
				PageSetSUI(ToPage, ThisStartUpID);
			}
1769
			END_CRIT_SECTION();
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1770

Bruce Momjian's avatar
Bruce Momjian committed
1771
			cur_page->offsets_used++;
Bruce Momjian's avatar
Bruce Momjian committed
1772
			num_moved++;
Bruce Momjian's avatar
Bruce Momjian committed
1773
			cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
1774
			if (cur_page->blkno > last_move_dest_block)
Bruce Momjian's avatar
Bruce Momjian committed
1775
				last_move_dest_block = cur_page->blkno;
1776

Bruce Momjian's avatar
Bruce Momjian committed
1777
			vacpage->offsets[vacpage->offsets_free++] = offnum;
1778

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1779 1780 1781
			LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

1782 1783 1784
			/* insert index' tuples if needed */
			if (Irel != (Relation *) NULL)
			{
1785

1786
				/*
1787 1788
				 * XXX using CurrentMemoryContext here means intra-vacuum
				 * memory leak for functional indexes. Should fix someday.
1789
				 *
1790 1791
				 * XXX This code fails to handle partial indexes! Probably
				 * should change it to use ExecOpenIndices.
1792 1793
				 */
				for (i = 0; i < nindices; i++)
1794
				{
1795
					FormIndexDatum(indexInfo[i],
1796
								   &newtup,
1797
								   tupdesc,
1798
								   CurrentMemoryContext,
1799
								   idatum,
1800
								   inulls);
Bruce Momjian's avatar
Bruce Momjian committed
1801
					iresult = index_insert(Irel[i],
1802 1803
										   idatum,
										   inulls,
1804
										   &newtup.t_self,
1805 1806 1807 1808 1809 1810
										   onerel);
					if (iresult)
						pfree(iresult);
				}
			}

Bruce Momjian's avatar
Bruce Momjian committed
1811
		}						/* walk along page */
1812

1813 1814
		if (offnum < maxoff && keep_tuples > 0)
		{
Bruce Momjian's avatar
Bruce Momjian committed
1815
			OffsetNumber off;
1816 1817

			for (off = OffsetNumberNext(offnum);
Bruce Momjian's avatar
Bruce Momjian committed
1818 1819
				 off <= maxoff;
				 off = OffsetNumberNext(off))
1820 1821 1822 1823
			{
				itemid = PageGetItemId(page, off);
				if (!ItemIdIsUsed(itemid))
					continue;
1824
				tuple.t_datamcxt = NULL;
1825 1826 1827
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
				if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
					continue;
Bruce Momjian's avatar
Bruce Momjian committed
1828
				if ((TransactionId) tuple.t_data->t_cmin != myXID)
1829 1830 1831 1832 1833
					elog(ERROR, "Invalid XID in t_cmin (4)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
Bruce Momjian's avatar
Bruce Momjian committed
1834
					/* some chains was moved while */
Bruce Momjian's avatar
Bruce Momjian committed
1835 1836
					if (chain_tuple_moved)
					{			/* cleaning this page */
Bruce Momjian's avatar
Bruce Momjian committed
1837 1838
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
1839
						{
Bruce Momjian's avatar
Bruce Momjian committed
1840
							if (vacpage->offsets[i] == off)
1841 1842
								break;
						}
Bruce Momjian's avatar
Bruce Momjian committed
1843
						if (i >= vacpage->offsets_free) /* not found */
1844
						{
Bruce Momjian's avatar
Bruce Momjian committed
1845
							vacpage->offsets[vacpage->offsets_free++] = off;
1846 1847 1848 1849 1850 1851
							Assert(keep_tuples > 0);
							keep_tuples--;
						}
					}
					else
					{
Bruce Momjian's avatar
Bruce Momjian committed
1852
						vacpage->offsets[vacpage->offsets_free++] = off;
1853 1854 1855 1856 1857 1858 1859
						Assert(keep_tuples > 0);
						keep_tuples--;
					}
				}
			}
		}

Bruce Momjian's avatar
Bruce Momjian committed
1860
		if (vacpage->offsets_free > 0)	/* some tuples were moved */
1861
		{
1862 1863
			if (chain_tuple_moved)		/* else - they are ordered */
			{
Bruce Momjian's avatar
Bruce Momjian committed
1864
				qsort((char *) (vacpage->offsets), vacpage->offsets_free,
Bruce Momjian's avatar
Bruce Momjian committed
1865
					  sizeof(OffsetNumber), vac_cmp_offno);
1866
			}
1867
			vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
1868
			WriteBuffer(buf);
1869
		}
1870 1871 1872 1873
		else if (dowrite)
			WriteBuffer(buf);
		else
			ReleaseBuffer(buf);
1874

1875 1876 1877 1878 1879 1880 1881
		if (offnum <= maxoff)
			break;				/* some item(s) left */

	}							/* walk along relation */

	blkno++;					/* new number of blocks */

Bruce Momjian's avatar
Bruce Momjian committed
1882
	if (cur_buffer != InvalidBuffer)
1883
	{
Bruce Momjian's avatar
Bruce Momjian committed
1884 1885
		Assert(num_moved > 0);
		WriteBuffer(cur_buffer);
1886
	}
1887

Bruce Momjian's avatar
Bruce Momjian committed
1888
	if (num_moved > 0)
1889
	{
1890
		/*
1891 1892 1893 1894 1895
		 * We have to commit our tuple movings before we truncate the
		 * relation.  Ideally we should do Commit/StartTransactionCommand
		 * here, relying on the session-level table lock to protect our
		 * exclusive access to the relation.  However, that would require
		 * a lot of extra code to close and re-open the relation, indices,
1896 1897
		 * etc.  For now, a quick hack: record status of current
		 * transaction as committed, and continue.
1898
		 */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1899
		RecordTransactionCommit();
1900
	}
1901 1902

	/*
1903 1904 1905 1906 1907
	 * We are not going to move any more tuples across pages, but we still
	 * need to apply vacuum_page to compact free space in the remaining
	 * pages in vacuum_pages list.  Note that some of these pages may also
	 * be in the fraged_pages list, and may have had tuples moved onto them;
	 * if so, we already did vacuum_page and needn't do it again.
1908
	 */
1909 1910 1911
	for (i = 0, curpage = vacuum_pages->pagedesc;
		 i < vacuumed_pages;
		 i++, curpage++)
1912
	{
1913
		Assert((*curpage)->blkno < blkno);
1914
		if ((*curpage)->offsets_used == 0)
1915
		{
1916 1917 1918 1919
			/* this page was not used as a move target, so must clean it */
			buf = ReadBuffer(onerel, (*curpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
1920
			if (!PageIsEmpty(page))
1921
				vacuum_page(onerel, buf, *curpage);
1922 1923
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
1924
		}
1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955
	}

	/*
	 * Now scan all the pages that we moved tuples onto and update
	 * tuple status bits.  This is not really necessary, but will save time
	 * for future transactions examining these tuples.
	 *
	 * XXX Notice that this code fails to clear HEAP_MOVED_OFF tuples from
	 * pages that were move source pages but not move dest pages.  One also
	 * wonders whether it wouldn't be better to skip this step and let the
	 * tuple status updates happen someplace that's not holding an exclusive
	 * lock on the relation.
	 */
	checked_moved = 0;
	for (i = 0, curpage = fraged_pages->pagedesc;
		 i < num_fraged_pages;
		 i++, curpage++)
	{
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->blkno > last_move_dest_block)
			break;				/* no need to scan any further */
		if ((*curpage)->offsets_used == 0)
			continue;			/* this page was never used as a move dest */
		buf = ReadBuffer(onerel, (*curpage)->blkno);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);
		num_tuples = 0;
		max_offset = PageGetMaxOffsetNumber(page);
		for (newoff = FirstOffsetNumber;
			 newoff <= max_offset;
			 newoff = OffsetNumberNext(newoff))
1956
		{
1957 1958 1959 1960 1961 1962
			itemid = PageGetItemId(page, newoff);
			if (!ItemIdIsUsed(itemid))
				continue;
			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1963
			{
1964 1965 1966
				if ((TransactionId) tuple.t_data->t_cmin != myXID)
					elog(ERROR, "Invalid XID in t_cmin (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1967
				{
1968 1969
					tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
					num_tuples++;
1970
				}
1971 1972 1973 1974
				else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
					tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
				else
					elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
1975 1976
			}
		}
1977
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1978
		WriteBuffer(buf);
1979 1980
		Assert((*curpage)->offsets_used == num_tuples);
		checked_moved += num_tuples;
1981
	}
Bruce Momjian's avatar
Bruce Momjian committed
1982
	Assert(num_moved == checked_moved);
1983

1984
	elog(MESSAGE_LEVEL, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u. %s",
1985
		 RelationGetRelationName(onerel),
Bruce Momjian's avatar
Bruce Momjian committed
1986
		 nblocks, blkno, num_moved,
Bruce Momjian's avatar
Bruce Momjian committed
1987
		 show_rusage(&ru0));
1988

1989
	/*
1990 1991
	 * Reflect the motion of system tuples to catalog cache here.
	 */
1992
	CommandCounterIncrement();
1993

Bruce Momjian's avatar
Bruce Momjian committed
1994
	if (Nvacpagelist.num_pages > 0)
1995
	{
1996 1997 1998
		/* vacuum indices again if needed */
		if (Irel != (Relation *) NULL)
		{
1999
			VacPage    *vpleft,
2000 2001
					   *vpright,
						vpsave;
2002

Bruce Momjian's avatar
Bruce Momjian committed
2003 2004
			/* re-sort Nvacpagelist.pagedesc */
			for (vpleft = Nvacpagelist.pagedesc,
2005
			vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
2006 2007 2008 2009 2010 2011
				 vpleft < vpright; vpleft++, vpright--)
			{
				vpsave = *vpleft;
				*vpleft = *vpright;
				*vpright = vpsave;
			}
2012
			Assert(keep_tuples >= 0);
2013
			for (i = 0; i < nindices; i++)
Bruce Momjian's avatar
Bruce Momjian committed
2014
				vacuum_index(&Nvacpagelist, Irel[i],
2015
							 vacrelstats->rel_tuples, keep_tuples);
2016 2017
		}

Bruce Momjian's avatar
Bruce Momjian committed
2018
		/* clean moved tuples from last page in Nvacpagelist list */
2019
		if (vacpage->blkno == (blkno - 1) &&
Bruce Momjian's avatar
Bruce Momjian committed
2020
			vacpage->offsets_free > 0)
2021
		{
2022 2023
			OffsetNumber unbuf[BLCKSZ/sizeof(OffsetNumber)];
			OffsetNumber *unused = unbuf;
2024
			int			uncnt;
2025

Bruce Momjian's avatar
Bruce Momjian committed
2026
			buf = ReadBuffer(onerel, vacpage->blkno);
2027
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2028
			page = BufferGetPage(buf);
2029
			num_tuples = 0;
2030
			maxoff = PageGetMaxOffsetNumber(page);
2031
			for (offnum = FirstOffsetNumber;
2032
				 offnum <= maxoff;
2033 2034 2035 2036 2037
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				if (!ItemIdIsUsed(itemid))
					continue;
2038
				tuple.t_datamcxt = NULL;
2039
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2040 2041 2042

				if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
				{
Bruce Momjian's avatar
Bruce Momjian committed
2043
					if ((TransactionId) tuple.t_data->t_cmin != myXID)
2044 2045 2046 2047 2048 2049 2050
						elog(ERROR, "Invalid XID in t_cmin (3)");
					if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
					{
						itemid->lp_flags &= ~LP_USED;
						num_tuples++;
					}
					else
2051
						elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
2052 2053
				}

2054
			}
Bruce Momjian's avatar
Bruce Momjian committed
2055
			Assert(vacpage->offsets_free == num_tuples);
2056
			START_CRIT_SECTION();
2057
			uncnt = PageRepairFragmentation(page, unused);
2058
			{
2059
				XLogRecPtr	recptr;
2060 2061 2062

				recptr = log_heap_clean(onerel, buf, (char *) unused,
						  (char *) (&(unused[uncnt])) - (char *) unused);
2063 2064 2065
				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
			}
2066
			END_CRIT_SECTION();
2067
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2068 2069 2070
			WriteBuffer(buf);
		}

Bruce Momjian's avatar
Bruce Momjian committed
2071
		/* now - free new list of reaped pages */
Bruce Momjian's avatar
Bruce Momjian committed
2072 2073 2074 2075
		curpage = Nvacpagelist.pagedesc;
		for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
			pfree(*curpage);
		pfree(Nvacpagelist.pagedesc);
2076 2077
	}

2078 2079
	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
2080 2081 2082
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
2083 2084 2085 2086 2087 2088 2089
	 */
	i = FlushRelationBuffers(onerel, blkno);
	if (i < 0)
		elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation, if needed */
2090
	if (blkno < nblocks)
2091
	{
Bruce Momjian's avatar
Bruce Momjian committed
2092
		blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
2093 2094
		onerel->rd_nblocks = blkno;	/* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
2095
		vacrelstats->rel_pages = blkno; /* set new number of blocks */
2096 2097 2098 2099
	}

	if (Irel != (Relation *) NULL)		/* pfree index' allocations */
	{
Bruce Momjian's avatar
Bruce Momjian committed
2100
		close_indices(nindices, Irel);
2101
		pfree(indexInfo);
2102 2103
	}

Bruce Momjian's avatar
Bruce Momjian committed
2104
	pfree(vacpage);
2105 2106
	if (vacrelstats->vtlinks != NULL)
		pfree(vacrelstats->vtlinks);
Bruce Momjian's avatar
Bruce Momjian committed
2107
}
2108 2109

/*
Bruce Momjian's avatar
Bruce Momjian committed
2110
 *	vacuum_heap() -- free dead tuples
2111
 *
2112 2113
 *		This routine marks dead tuples as unused and truncates relation
 *		if there are "empty" end-blocks.
2114 2115
 */
static void
Bruce Momjian's avatar
Bruce Momjian committed
2116
vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
2117
{
2118
	Buffer		buf;
Bruce Momjian's avatar
Bruce Momjian committed
2119
	VacPage    *vacpage;
2120 2121
	BlockNumber	relblocks;
	int			nblocks;
2122
	int			i;
2123

Bruce Momjian's avatar
Bruce Momjian committed
2124
	nblocks = vacuum_pages->num_pages;
2125
	nblocks -= vacuum_pages->empty_end_pages;	/* nothing to do with them */
2126

Bruce Momjian's avatar
Bruce Momjian committed
2127
	for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
2128
	{
Bruce Momjian's avatar
Bruce Momjian committed
2129
		if ((*vacpage)->offsets_free > 0)
2130
		{
Bruce Momjian's avatar
Bruce Momjian committed
2131
			buf = ReadBuffer(onerel, (*vacpage)->blkno);
2132 2133 2134
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			vacuum_page(onerel, buf, *vacpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2135 2136
			WriteBuffer(buf);
		}
2137 2138
	}

2139 2140
	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
2141 2142 2143
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
2144
	 */
2145
	Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
2146
	relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;
2147

2148
	i = FlushRelationBuffers(onerel, relblocks);
2149 2150 2151 2152
	if (i < 0)
		elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
			 i);

2153
	/* truncate relation if there are some empty end-pages */
Bruce Momjian's avatar
Bruce Momjian committed
2154
	if (vacuum_pages->empty_end_pages > 0)
2155
	{
2156
		elog(MESSAGE_LEVEL, "Rel %s: Pages: %u --> %u.",
2157
			 RelationGetRelationName(onerel),
2158 2159
			 vacrelstats->rel_pages, relblocks);
		relblocks = smgrtruncate(DEFAULT_SMGR, onerel, relblocks);
2160 2161
		onerel->rd_nblocks = relblocks;	/* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
2162
		vacrelstats->rel_pages = relblocks;		/* set new number of
2163
												 * blocks */
2164
	}
Bruce Momjian's avatar
Bruce Momjian committed
2165
}
2166 2167

/*
Bruce Momjian's avatar
Bruce Momjian committed
2168
 *	vacuum_page() -- free dead tuples on a page
2169
 *					 and repair its fragmentation.
2170 2171
 */
static void
2172
vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
2173
{
2174 2175
	OffsetNumber unbuf[BLCKSZ/sizeof(OffsetNumber)];
	OffsetNumber *unused = unbuf;
2176 2177 2178 2179
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;
	int			i;
2180

2181
	/* There shouldn't be any tuples moved onto the page yet! */
Bruce Momjian's avatar
Bruce Momjian committed
2182
	Assert(vacpage->offsets_used == 0);
2183

2184
	START_CRIT_SECTION();
Bruce Momjian's avatar
Bruce Momjian committed
2185
	for (i = 0; i < vacpage->offsets_free; i++)
2186
	{
Bruce Momjian's avatar
Bruce Momjian committed
2187
		itemid = &(((PageHeader) page)->pd_linp[vacpage->offsets[i] - 1]);
2188
		itemid->lp_flags &= ~LP_USED;
2189
	}
2190
	uncnt = PageRepairFragmentation(page, unused);
2191
	{
2192
		XLogRecPtr	recptr;
2193 2194 2195

		recptr = log_heap_clean(onerel, buffer, (char *) unused,
						  (char *) (&(unused[uncnt])) - (char *) unused);
2196 2197 2198
		PageSetLSN(page, recptr);
		PageSetSUI(page, ThisStartUpID);
	}
2199
	END_CRIT_SECTION();
Bruce Momjian's avatar
Bruce Momjian committed
2200
}
2201

2202
/*
Bruce Momjian's avatar
Bruce Momjian committed
2203
 *	_scan_index() -- scan one index relation to update statistic.
2204 2205 2206
 *
 */
static void
2207
scan_index(Relation indrel, double num_tuples)
2208
{
2209
	RetrieveIndexResult res;
2210
	IndexScanDesc iscan;
2211 2212
	BlockNumber	nipages;
	double		nitups;
2213
	VacRUsage	ru0;
2214

2215
	init_rusage(&ru0);
2216

2217 2218 2219
	/* walk through the entire index */
	iscan = index_beginscan(indrel, false, 0, (ScanKey) NULL);
	nitups = 0;
2220

2221 2222 2223
	while ((res = index_getnext(iscan, ForwardScanDirection))
		   != (RetrieveIndexResult) NULL)
	{
2224
		nitups += 1;
2225 2226
		pfree(res);
	}
2227

2228
	index_endscan(iscan);
2229

2230 2231
	/* now update statistics in pg_class */
	nipages = RelationGetNumberOfBlocks(indrel);
2232
	vac_update_relstats(RelationGetRelid(indrel), nipages, nitups, false);
2233

2234
	elog(MESSAGE_LEVEL, "Index %s: Pages %u; Tuples %.0f. %s",
2235
		 RelationGetRelationName(indrel), nipages, nitups,
Bruce Momjian's avatar
Bruce Momjian committed
2236
		 show_rusage(&ru0));
2237

Bruce Momjian's avatar
Bruce Momjian committed
2238
	if (nitups != num_tuples)
2239
		elog(NOTICE, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f).\
2240
\n\tRecreate the index.",
2241
			 RelationGetRelationName(indrel), nitups, num_tuples);
2242

Bruce Momjian's avatar
Bruce Momjian committed
2243
}
2244

2245
/*
Bruce Momjian's avatar
Bruce Momjian committed
2246
 *	vacuum_index() -- vacuum one index relation.
2247
 *
Bruce Momjian's avatar
Bruce Momjian committed
2248
 *		Vpl is the VacPageList of the heap we're currently vacuuming.
2249 2250 2251 2252
 *		It's locked. Indrel is an index relation on the vacuumed heap.
 *		We don't set locks on the index	relation here, since the indexed
 *		access methods support locking at different granularities.
 *		We let them handle it.
2253
 *
2254 2255
 *		Finally, we arrange to update the index relation's statistics in
 *		pg_class.
2256 2257
 */
static void
2258
vacuum_index(VacPageList vacpagelist, Relation indrel,
2259
			 double num_tuples, int keep_tuples)
2260
{
2261
	RetrieveIndexResult res;
2262 2263
	IndexScanDesc iscan;
	ItemPointer heapptr;
2264
	int			tups_vacuumed;
2265 2266
	BlockNumber	num_pages;
	double		num_index_tuples;
Bruce Momjian's avatar
Bruce Momjian committed
2267
	VacPage		vp;
2268
	VacRUsage	ru0;
2269

2270
	init_rusage(&ru0);
2271 2272 2273

	/* walk through the entire index */
	iscan = index_beginscan(indrel, false, 0, (ScanKey) NULL);
2274
	tups_vacuumed = 0;
Bruce Momjian's avatar
Bruce Momjian committed
2275
	num_index_tuples = 0;
2276 2277 2278

	while ((res = index_getnext(iscan, ForwardScanDirection))
		   != (RetrieveIndexResult) NULL)
2279
	{
2280 2281
		heapptr = &res->heap_iptr;

Bruce Momjian's avatar
Bruce Momjian committed
2282
		if ((vp = tid_reaped(heapptr, vacpagelist)) != (VacPage) NULL)
2283
		{
2284
#ifdef NOT_USED
2285 2286 2287 2288 2289
			elog(DEBUG, "<%x,%x> -> <%x,%x>",
				 ItemPointerGetBlockNumber(&(res->index_iptr)),
				 ItemPointerGetOffsetNumber(&(res->index_iptr)),
				 ItemPointerGetBlockNumber(&(res->heap_iptr)),
				 ItemPointerGetOffsetNumber(&(res->heap_iptr)));
2290
#endif
Bruce Momjian's avatar
Bruce Momjian committed
2291
			if (vp->offsets_free == 0)
Bruce Momjian's avatar
Bruce Momjian committed
2292
			{
2293
				elog(NOTICE, "Index %s: pointer to EmptyPage (blk %u off %u) - fixing",
2294
					 RelationGetRelationName(indrel),
Bruce Momjian's avatar
Bruce Momjian committed
2295
					 vp->blkno, ItemPointerGetOffsetNumber(heapptr));
2296
			}
2297
			++tups_vacuumed;
2298 2299 2300
			index_delete(indrel, &res->index_iptr);
		}
		else
2301
			num_index_tuples += 1;
2302

2303 2304
		pfree(res);
	}
2305

2306
	index_endscan(iscan);
2307

2308
	/* now update statistics in pg_class */
Bruce Momjian's avatar
Bruce Momjian committed
2309
	num_pages = RelationGetNumberOfBlocks(indrel);
2310 2311
	vac_update_relstats(RelationGetRelid(indrel),
						num_pages, num_index_tuples, false);
2312

2313
	elog(MESSAGE_LEVEL, "Index %s: Pages %u; Tuples %.0f: Deleted %u. %s",
2314
		 RelationGetRelationName(indrel), num_pages,
2315
		 num_index_tuples - keep_tuples, tups_vacuumed,
Bruce Momjian's avatar
Bruce Momjian committed
2316
		 show_rusage(&ru0));
2317

2318
	if (num_index_tuples != num_tuples + keep_tuples)
2319
		elog(NOTICE, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f).\
2320
\n\tRecreate the index.",
Bruce Momjian's avatar
Bruce Momjian committed
2321
		  RelationGetRelationName(indrel), num_index_tuples, num_tuples);
2322

Bruce Momjian's avatar
Bruce Momjian committed
2323
}
2324 2325

/*
Bruce Momjian's avatar
Bruce Momjian committed
2326
 *	tid_reaped() -- is a particular tid reaped?
2327
 *
Bruce Momjian's avatar
Bruce Momjian committed
2328
 *		vacpagelist->VacPage_array is sorted in right order.
2329
 */
Bruce Momjian's avatar
Bruce Momjian committed
2330 2331
static VacPage
tid_reaped(ItemPointer itemptr, VacPageList vacpagelist)
2332
{
2333 2334
	OffsetNumber ioffno;
	OffsetNumber *voff;
Bruce Momjian's avatar
Bruce Momjian committed
2335
	VacPage		vp,
2336
			   *vpp;
Bruce Momjian's avatar
Bruce Momjian committed
2337
	VacPageData vacpage;
2338

Bruce Momjian's avatar
Bruce Momjian committed
2339
	vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
2340
	ioffno = ItemPointerGetOffsetNumber(itemptr);
2341

Bruce Momjian's avatar
Bruce Momjian committed
2342
	vp = &vacpage;
2343 2344 2345 2346
	vpp = (VacPage *) vac_bsearch((void *) &vp,
								  (void *) (vacpagelist->pagedesc),
								  vacpagelist->num_pages,
								  sizeof(VacPage),
2347
								  vac_cmp_blk);
2348

Bruce Momjian's avatar
Bruce Momjian committed
2349 2350
	if (vpp == (VacPage *) NULL)
		return (VacPage) NULL;
2351

2352 2353
	/* ok - we are on a partially or fully reaped page */
	vp = *vpp;
2354

Bruce Momjian's avatar
Bruce Momjian committed
2355
	if (vp->offsets_free == 0)
2356 2357
	{
		/* this is EmptyPage, so claim all tuples on it are reaped!!! */
2358
		return vp;
2359 2360
	}

2361 2362 2363 2364
	voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
										(void *) (vp->offsets),
										vp->offsets_free,
										sizeof(OffsetNumber),
2365
										vac_cmp_offno);
2366 2367

	if (voff == (OffsetNumber *) NULL)
Bruce Momjian's avatar
Bruce Momjian committed
2368
		return (VacPage) NULL;
2369

2370
	/* tid is reaped */
2371
	return vp;
Bruce Momjian's avatar
Bruce Momjian committed
2372
}
2373

2374
/*
2375
 *	vac_update_relstats() -- update statistics for one relation
2376
 *
2377 2378 2379
 *		Update the whole-relation statistics that are kept in its pg_class
 *		row.  There are additional stats that will be updated if we are
 *		doing VACUUM ANALYZE, but we always update these stats.
2380
 *
2381 2382
 *		This routine works for both index and heap relation entries in
 *		pg_class.  We violate no-overwrite semantics here by storing new
2383
 *		values for the statistics columns directly into the pg_class
2384
 *		tuple that's already on the page.  The reason for this is that if
2385 2386 2387 2388 2389
 *		we updated these tuples in the usual way, vacuuming pg_class itself
 *		wouldn't work very well --- by the time we got done with a vacuum
 *		cycle, most of the tuples in pg_class would've been obsoleted.
 *		Of course, this only works for fixed-size never-null columns, but
 *		these are.
2390
 */
2391
void
2392
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
2393
					bool hasindex)
2394
{
2395
	Relation	rd;
Bruce Momjian's avatar
Bruce Momjian committed
2396
	HeapTupleData rtup;
2397
	HeapTuple	ctup;
Bruce Momjian's avatar
Bruce Momjian committed
2398 2399
	Form_pg_class pgcform;
	Buffer		buffer;
2400

2401 2402 2403
	/*
	 * update number of tuples and number of pages in pg_class
	 */
2404 2405
	rd = heap_openr(RelationRelationName, RowExclusiveLock);

2406 2407 2408
	ctup = SearchSysCache(RELOID,
						  ObjectIdGetDatum(relid),
						  0, 0, 0);
Bruce Momjian's avatar
Bruce Momjian committed
2409
	if (!HeapTupleIsValid(ctup))
2410
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
2411
			 relid);
2412

2413
	/* get the buffer cache tuple */
2414
	rtup.t_self = ctup->t_self;
2415
	ReleaseSysCache(ctup);
2416
	heap_fetch(rd, SnapshotNow, &rtup, &buffer, NULL);
Bruce Momjian's avatar
Bruce Momjian committed
2417

2418
	/* overwrite the existing statistics in the tuple */
2419
	pgcform = (Form_pg_class) GETSTRUCT(&rtup);
2420
	pgcform->relpages = (int32) num_pages;
2421
	pgcform->reltuples = num_tuples;
2422 2423
	pgcform->relhasindex = hasindex;

2424 2425 2426 2427 2428
	/* invalidate the tuple in the cache and write the buffer */
	RelationInvalidateHeapTuple(rd, &rtup);
	WriteBuffer(buffer);

	heap_close(rd, RowExclusiveLock);
2429
}
2430

2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469
/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages)
{
	int			nPages = fraged_pages->num_pages;
	int			i;
	BlockNumber *pages;
	Size	   *spaceAvail;

	/* +1 to avoid palloc(0) */
	pages = (BlockNumber *) palloc((nPages + 1) * sizeof(BlockNumber));
	spaceAvail = (Size *) palloc((nPages + 1) * sizeof(Size));

	for (i = 0; i < nPages; i++)
	{
		pages[i] = fraged_pages->pagedesc[i]->blkno;
		spaceAvail[i] = fraged_pages->pagedesc[i]->free;
		/*
		 * fraged_pages may contain entries for pages that we later decided
		 * to truncate from the relation; don't enter them into the map!
		 */
		if (pages[i] >= rel_pages)
		{
			nPages = i;
			break;
		}
	}

	MultiRecordFreeSpace(&onerel->rd_node,
						 0, MaxBlockNumber,
						 nPages, pages, spaceAvail);
	pfree(pages);
	pfree(spaceAvail);
}

2470 2471 2472
/* Copy a VacPage structure */
static VacPage
copy_vac_page(VacPage vacpage)
2473
{
2474
	VacPage		newvacpage;
2475

Bruce Momjian's avatar
Bruce Momjian committed
2476
	/* allocate a VacPageData entry */
2477 2478
	newvacpage = (VacPage) palloc(sizeof(VacPageData) +
								  vacpage->offsets_free * sizeof(OffsetNumber));
2479

2480
	/* fill it in */
Bruce Momjian's avatar
Bruce Momjian committed
2481
	if (vacpage->offsets_free > 0)
2482 2483
		memcpy(newvacpage->offsets, vacpage->offsets,
			   vacpage->offsets_free * sizeof(OffsetNumber));
Bruce Momjian's avatar
Bruce Momjian committed
2484 2485 2486 2487
	newvacpage->blkno = vacpage->blkno;
	newvacpage->free = vacpage->free;
	newvacpage->offsets_used = vacpage->offsets_used;
	newvacpage->offsets_free = vacpage->offsets_free;
2488

2489
	return newvacpage;
Bruce Momjian's avatar
Bruce Momjian committed
2490
}
2491

2492 2493 2494 2495 2496 2497 2498
/*
 * Add a VacPage pointer to a VacPageList.
 *
 *		As a side effect of the way that scan_heap works,
 *		higher pages come after lower pages in the array
 *		(and highest tid on a page is last).
 */
Bruce Momjian's avatar
Bruce Momjian committed
2499 2500
static void
vpage_insert(VacPageList vacpagelist, VacPage vpnew)
2501
{
2502
#define PG_NPAGEDESC 1024
2503

Bruce Momjian's avatar
Bruce Momjian committed
2504 2505
	/* allocate a VacPage entry if needed */
	if (vacpagelist->num_pages == 0)
2506
	{
Bruce Momjian's avatar
Bruce Momjian committed
2507 2508
		vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
		vacpagelist->num_allocated_pages = PG_NPAGEDESC;
2509
	}
Bruce Momjian's avatar
Bruce Momjian committed
2510
	else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
2511
	{
Bruce Momjian's avatar
Bruce Momjian committed
2512 2513
		vacpagelist->num_allocated_pages *= 2;
		vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
2514
	}
Bruce Momjian's avatar
Bruce Momjian committed
2515 2516
	vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
	(vacpagelist->num_pages)++;
2517 2518
}

2519 2520 2521 2522 2523 2524 2525
/*
 * vac_bsearch: just like standard C library routine bsearch(),
 * except that we first test to see whether the target key is outside
 * the range of the table entries.  This case is handled relatively slowly
 * by the normal binary search algorithm (ie, no faster than any other key)
 * but it occurs often enough in VACUUM to be worth optimizing.
 */
2526
static void *
2527 2528
vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
2529
			int (*compar) (const void *, const void *))
2530
{
2531
	int			res;
2532 2533 2534 2535 2536 2537 2538 2539 2540 2541
	const void *last;

	if (nelem == 0)
		return NULL;
	res = compar(key, base);
	if (res < 0)
		return NULL;
	if (res == 0)
		return (void *) base;
	if (nelem > 1)
2542
	{
2543 2544 2545
		last = (const void *) ((const char *) base + (nelem - 1) * size);
		res = compar(key, last);
		if (res > 0)
2546
			return NULL;
2547 2548
		if (res == 0)
			return (void *) last;
2549
	}
2550 2551 2552
	if (nelem <= 2)
		return NULL;			/* already checked 'em all */
	return bsearch(key, base, nelem, size, compar);
Bruce Momjian's avatar
Bruce Momjian committed
2553
}
2554

2555 2556 2557
/*
 * Comparator routines for use with qsort() and bsearch().
 */
2558
static int
Bruce Momjian's avatar
Bruce Momjian committed
2559
vac_cmp_blk(const void *left, const void *right)
2560
{
2561 2562
	BlockNumber lblk,
				rblk;
2563

Bruce Momjian's avatar
Bruce Momjian committed
2564 2565
	lblk = (*((VacPage *) left))->blkno;
	rblk = (*((VacPage *) right))->blkno;
2566

2567
	if (lblk < rblk)
2568
		return -1;
2569
	if (lblk == rblk)
2570 2571
		return 0;
	return 1;
Bruce Momjian's avatar
Bruce Momjian committed
2572
}
2573

2574
static int
Bruce Momjian's avatar
Bruce Momjian committed
2575
vac_cmp_offno(const void *left, const void *right)
2576
{
2577
	if (*(OffsetNumber *) left < *(OffsetNumber *) right)
2578
		return -1;
2579
	if (*(OffsetNumber *) left == *(OffsetNumber *) right)
2580 2581
		return 0;
	return 1;
Bruce Momjian's avatar
Bruce Momjian committed
2582
}
2583

2584
static int
Bruce Momjian's avatar
Bruce Momjian committed
2585
vac_cmp_vtlinks(const void *left, const void *right)
2586
{
Bruce Momjian's avatar
Bruce Momjian committed
2587 2588
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2589
		return -1;
Bruce Momjian's avatar
Bruce Momjian committed
2590 2591
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2592 2593
		return 1;
	/* bi_hi-es are equal */
Bruce Momjian's avatar
Bruce Momjian committed
2594 2595
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2596
		return -1;
Bruce Momjian's avatar
Bruce Momjian committed
2597 2598
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2599 2600
		return 1;
	/* bi_lo-es are equal */
Bruce Momjian's avatar
Bruce Momjian committed
2601 2602
	if (((VTupleLink) left)->new_tid.ip_posid <
		((VTupleLink) right)->new_tid.ip_posid)
2603
		return -1;
Bruce Momjian's avatar
Bruce Momjian committed
2604 2605
	if (((VTupleLink) left)->new_tid.ip_posid >
		((VTupleLink) right)->new_tid.ip_posid)
2606 2607 2608
		return 1;
	return 0;
}
2609

2610

2611
static void
2612
get_indices(Relation relation, int *nindices, Relation **Irel)
2613
{
2614 2615 2616
	List	   *indexoidlist,
			   *indexoidscan;
	int			i;
2617

2618
	indexoidlist = RelationGetIndexList(relation);
2619

2620
	*nindices = length(indexoidlist);
2621

2622 2623 2624 2625
	if (*nindices > 0)
		*Irel = (Relation *) palloc(*nindices * sizeof(Relation));
	else
		*Irel = NULL;
2626

2627 2628
	i = 0;
	foreach(indexoidscan, indexoidlist)
2629
	{
2630
		Oid			indexoid = lfirsti(indexoidscan);
2631

2632 2633
		(*Irel)[i] = index_open(indexoid);
		i++;
2634 2635
	}

2636
	freeList(indexoidlist);
Bruce Momjian's avatar
Bruce Momjian committed
2637
}
2638 2639 2640


static void
Bruce Momjian's avatar
Bruce Momjian committed
2641
close_indices(int nindices, Relation *Irel)
2642 2643
{

2644 2645
	if (Irel == (Relation *) NULL)
		return;
2646

2647 2648 2649
	while (nindices--)
		index_close(Irel[nindices]);
	pfree(Irel);
2650

Bruce Momjian's avatar
Bruce Momjian committed
2651
}
2652 2653


2654 2655 2656 2657 2658
/*
 * Obtain IndexInfo data for each index on the rel
 */
static IndexInfo **
get_index_desc(Relation onerel, int nindices, Relation *Irel)
2659
{
2660
	IndexInfo **indexInfo;
2661
	int			i;
2662
	HeapTuple	cachetuple;
2663

2664
	indexInfo = (IndexInfo **) palloc(nindices * sizeof(IndexInfo *));
2665

2666
	for (i = 0; i < nindices; i++)
2667
	{
2668
		cachetuple = SearchSysCache(INDEXRELID,
2669
							 ObjectIdGetDatum(RelationGetRelid(Irel[i])),
2670
									0, 0, 0);
2671 2672 2673 2674
		if (!HeapTupleIsValid(cachetuple))
			elog(ERROR, "get_index_desc: index %u not found",
				 RelationGetRelid(Irel[i]));
		indexInfo[i] = BuildIndexInfo(cachetuple);
2675
		ReleaseSysCache(cachetuple);
2676 2677
	}

2678
	return indexInfo;
Bruce Momjian's avatar
Bruce Momjian committed
2679
}
2680 2681


2682
static bool
Bruce Momjian's avatar
Bruce Momjian committed
2683
enough_space(VacPage vacpage, Size len)
2684 2685
{

2686
	len = MAXALIGN(len);
2687

Bruce Momjian's avatar
Bruce Momjian committed
2688
	if (len > vacpage->free)
2689
		return false;
2690

Bruce Momjian's avatar
Bruce Momjian committed
2691
	if (vacpage->offsets_used < vacpage->offsets_free)	/* there are free
2692
														 * itemid(s) */
2693
		return true;			/* and len <= free_space */
2694 2695

	/* ok. noff_usd >= noff_free and so we'll have to allocate new itemid */
Bruce Momjian's avatar
Bruce Momjian committed
2696
	if (len + MAXALIGN(sizeof(ItemIdData)) <= vacpage->free)
2697
		return true;
2698

2699
	return false;
2700

Bruce Momjian's avatar
Bruce Momjian committed
2701
}
2702 2703


2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715
/*
 * Initialize usage snapshot.
 */
static void
init_rusage(VacRUsage *ru0)
{
	struct timezone tz;

	getrusage(RUSAGE_SELF, &ru0->ru);
	gettimeofday(&ru0->tv, &tz);
}

2716 2717 2718 2719 2720 2721 2722
/*
 * Compute elapsed time since ru0 usage snapshot, and format into
 * a displayable string.  Result is in a static string, which is
 * tacky, but no one ever claimed that the Postgres backend is
 * threadable...
 */
static char *
2723
show_rusage(VacRUsage *ru0)
2724
{
2725 2726
	static char result[100];
	VacRUsage	ru1;
2727

2728
	init_rusage(&ru1);
2729

2730 2731 2732 2733 2734 2735
	if (ru1.tv.tv_usec < ru0->tv.tv_usec)
	{
		ru1.tv.tv_sec--;
		ru1.tv.tv_usec += 1000000;
	}
	if (ru1.ru.ru_stime.tv_usec < ru0->ru.ru_stime.tv_usec)
2736
	{
2737 2738
		ru1.ru.ru_stime.tv_sec--;
		ru1.ru.ru_stime.tv_usec += 1000000;
2739
	}
2740
	if (ru1.ru.ru_utime.tv_usec < ru0->ru.ru_utime.tv_usec)
2741
	{
2742 2743
		ru1.ru.ru_utime.tv_sec--;
		ru1.ru.ru_utime.tv_usec += 1000000;
2744 2745 2746
	}

	snprintf(result, sizeof(result),
2747 2748 2749 2750 2751 2752 2753
			 "CPU %d.%02ds/%d.%02du sec elapsed %d.%02d sec.",
			 (int) (ru1.ru.ru_stime.tv_sec - ru0->ru.ru_stime.tv_sec),
			 (int) (ru1.ru.ru_stime.tv_usec - ru0->ru.ru_stime.tv_usec) / 10000,
			 (int) (ru1.ru.ru_utime.tv_sec - ru0->ru.ru_utime.tv_sec),
			 (int) (ru1.ru.ru_utime.tv_usec - ru0->ru.ru_utime.tv_usec) / 10000,
			 (int) (ru1.tv.tv_sec - ru0->tv.tv_sec),
			 (int) (ru1.tv.tv_usec - ru0->tv.tv_usec) / 10000);
2754 2755 2756

	return result;
}