/*-------------------------------------------------------------------------
 *
 * cluster.c
 *	  CLUSTER a table on an index.
 *
 * There is hardly anything left of Paul Brown's original implementation...
 *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994-5, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/cluster.c,v 1.178 2008/10/14 17:19:50 alvherre Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/relscan.h"
#include "access/rewriteheap.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/heap.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_namespace.h"
#include "catalog/toasting.h"
#include "commands/cluster.h"
#include "commands/tablecmds.h"
#include "commands/trigger.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/procarray.h"
#include "utils/acl.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/relcache.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"

/*
 * This struct is used to pass around the information on tables to be
54 55 56 57 58
 * clustered. We need this so we can make a list of them when invoked without
 * a specific table/index pair.
 */
typedef struct
{
Bruce Momjian's avatar
Bruce Momjian committed
59 60
	Oid			tableOid;
	Oid			indexOid;
61
} RelToCluster;
62

63

64
static void cluster_rel(RelToCluster *rv, bool recheck);
65
static void rebuild_relation(Relation OldHeap, Oid indexOid);
66
static TransactionId copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex);
67 68 69 70 71
static List *get_tables_to_cluster(MemoryContext cluster_context);



/*---------------------------------------------------------------------------
Bruce Momjian's avatar
Bruce Momjian committed
72
 * This cluster code allows for clustering multiple tables at once. Because
73 74 75 76 77 78 79
 * of this, we cannot just run everything on a single transaction, or we
 * would be forced to acquire exclusive locks on all the tables being
 * clustered, simultaneously --- very likely leading to deadlock.
 *
 * To solve this we follow a similar strategy to VACUUM code,
 * clustering each relation in a separate transaction. For this to work,
 * we need to:
Bruce Momjian's avatar
Bruce Momjian committed
80 81 82 83 84 85 86
 *	- provide a separate memory context so that we can pass information in
 *	  a way that survives across transactions
 *	- start a new transaction every time a new relation is clustered
 *	- check for validity of the information on to-be-clustered relations,
 *	  as someone might have deleted a relation behind our back, or
 *	  clustered one on a different index
 *	- end the transaction
87 88 89
 *
 * The single-relation case does not have any such overhead.
 *
Bruce Momjian's avatar
Bruce Momjian committed
90
 * We also allow a relation to be specified without index.	In that case,
91 92 93 94 95
 * the indisclustered bit will be looked up, and an ERROR will be thrown
 * if there is no index with the bit set.
 *---------------------------------------------------------------------------
 */
void
96
cluster(ClusterStmt *stmt, bool isTopLevel)
97 98 99 100
{
	if (stmt->relation != NULL)
	{
		/* This is the single-relation case. */
Bruce Momjian's avatar
Bruce Momjian committed
101 102 103 104
		Oid			tableOid,
					indexOid = InvalidOid;
		Relation	rel;
		RelToCluster rvtc;
105 106

		/* Find and lock the table */
107
		rel = heap_openrv(stmt->relation, AccessExclusiveLock);
108

109
		tableOid = RelationGetRelid(rel);
110 111

		/* Check permissions */
112 113 114
		if (!pg_class_ownercheck(tableOid, GetUserId()))
			aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
						   RelationGetRelationName(rel));
115

116
		/*
Bruce Momjian's avatar
Bruce Momjian committed
117 118
		 * Reject clustering a remote temp table ... their local buffer
		 * manager is not going to cope.
119 120 121 122
		 */
		if (isOtherTempNamespace(RelationGetNamespace(rel)))
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
Bruce Momjian's avatar
Bruce Momjian committed
123
			   errmsg("cannot cluster temporary tables of other sessions")));
124

125 126
		if (stmt->indexname == NULL)
		{
127
			ListCell   *index;
128 129

			/* We need to find the index that has indisclustered set. */
Bruce Momjian's avatar
Bruce Momjian committed
130
			foreach(index, RelationGetIndexList(rel))
131
			{
Bruce Momjian's avatar
Bruce Momjian committed
132 133
				HeapTuple	idxtuple;
				Form_pg_index indexForm;
134

135
				indexOid = lfirst_oid(index);
136 137 138 139
				idxtuple = SearchSysCache(INDEXRELID,
										  ObjectIdGetDatum(indexOid),
										  0, 0, 0);
				if (!HeapTupleIsValid(idxtuple))
140
					elog(ERROR, "cache lookup failed for index %u", indexOid);
141 142 143 144 145 146 147 148 149 150 151
				indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
				if (indexForm->indisclustered)
				{
					ReleaseSysCache(idxtuple);
					break;
				}
				ReleaseSysCache(idxtuple);
				indexOid = InvalidOid;
			}

			if (!OidIsValid(indexOid))
152 153 154 155
				ereport(ERROR,
						(errcode(ERRCODE_UNDEFINED_OBJECT),
						 errmsg("there is no previously clustered index for table \"%s\"",
								stmt->relation->relname)));
156 157 158
		}
		else
		{
Bruce Momjian's avatar
Bruce Momjian committed
159 160 161 162
			/*
			 * The index is expected to be in the same namespace as the
			 * relation.
			 */
163 164 165
			indexOid = get_relname_relid(stmt->indexname,
										 rel->rd_rel->relnamespace);
			if (!OidIsValid(indexOid))
166 167
				ereport(ERROR,
						(errcode(ERRCODE_UNDEFINED_OBJECT),
168 169
					   errmsg("index \"%s\" for table \"%s\" does not exist",
							  stmt->indexname, stmt->relation->relname)));
170 171
		}

172
		/* All other checks are done in cluster_rel() */
173 174 175 176 177 178 179 180 181 182 183 184
		rvtc.tableOid = tableOid;
		rvtc.indexOid = indexOid;

		/* close relation, keep lock till commit */
		heap_close(rel, NoLock);

		/* Do the job */
		cluster_rel(&rvtc, false);
	}
	else
	{
		/*
185 186
		 * This is the "multi relation" case. We need to cluster all tables
		 * that have some index with indisclustered set.
187
		 */
Bruce Momjian's avatar
Bruce Momjian committed
188
		MemoryContext cluster_context;
189 190
		List	   *rvs;
		ListCell   *rv;
191 192

		/*
193 194
		 * We cannot run this form of CLUSTER inside a user transaction block;
		 * we'd be holding locks way too long.
195
		 */
196
		PreventTransactionChain(isTopLevel, "CLUSTER");
197 198 199 200

		/*
		 * Create special memory context for cross-transaction storage.
		 *
201 202
		 * Since it is a child of PortalContext, it will go away even in case
		 * of error.
203
		 */
204
		cluster_context = AllocSetContextCreate(PortalContext,
205 206 207 208 209 210
												"Cluster",
												ALLOCSET_DEFAULT_MINSIZE,
												ALLOCSET_DEFAULT_INITSIZE,
												ALLOCSET_DEFAULT_MAXSIZE);

		/*
211 212
		 * Build the list of relations to cluster.	Note that this lives in
		 * cluster_context.
213 214 215 216
		 */
		rvs = get_tables_to_cluster(cluster_context);

		/* Commit to get out of starting transaction */
217
		PopActiveSnapshot();
218
		CommitTransactionCommand();
219 220

		/* Ok, now that we've got them all, cluster them one by one */
Bruce Momjian's avatar
Bruce Momjian committed
221
		foreach(rv, rvs)
222
		{
Bruce Momjian's avatar
Bruce Momjian committed
223
			RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
224 225

			/* Start a new transaction for each relation. */
226
			StartTransactionCommand();
227
			/* functions in indexes may want a snapshot set */
228
			PushActiveSnapshot(GetTransactionSnapshot());
229
			cluster_rel(rvtc, true);
230
			PopActiveSnapshot();
231
			CommitTransactionCommand();
232 233 234
		}

		/* Start a new transaction for the cleanup work. */
235
		StartTransactionCommand();
236

237 238 239 240
		/* Clean up working storage */
		MemoryContextDelete(cluster_context);
	}
}
241

242
/*
243
 * cluster_rel
244
 *
245 246
 * This clusters the table by creating a new, clustered table and
 * swapping the relfilenodes of the new table and the old table, so
Bruce Momjian's avatar
Bruce Momjian committed
247
 * the OID of the original table is preserved.	Thus we do not lose
248
 * GRANT, inheritance nor references to this table (this was a bug
249
 * in releases thru 7.3).
250
 *
251 252 253 254
 * Also create new indexes and swap the filenodes with the old indexes the
 * same way we do for the relation.  Since we are effectively bulk-loading
 * the new table, it's better to create the indexes afterwards than to fill
 * them incrementally while we load the table.
255
 */
256
static void
257
cluster_rel(RelToCluster *rvtc, bool recheck)
258
{
259
	Relation	OldHeap;
260

261 262 263
	/* Check for user-requested abort. */
	CHECK_FOR_INTERRUPTS();

264 265 266 267 268 269 270 271 272 273 274 275
	/*
	 * We grab exclusive access to the target rel and index for the duration
	 * of the transaction.	(This is redundant for the single-transaction
	 * case, since cluster() already did it.)  The index lock is taken inside
	 * check_index_is_clusterable.
	 */
	OldHeap = try_relation_open(rvtc->tableOid, AccessExclusiveLock);

	/* If the table has gone away, we can skip processing it */
	if (!OldHeap)
		return;

276
	/*
277 278
	 * Since we may open a new transaction for each relation, we have to check
	 * that the relation still is what we think it is.
279
	 *
280 281 282
	 * If this is a single-transaction CLUSTER, we can skip these tests. We
	 * *must* skip the one on indisclustered since it would reject an attempt
	 * to cluster a not-previously-clustered index.
283
	 */
284 285
	if (recheck)
	{
Bruce Momjian's avatar
Bruce Momjian committed
286 287
		HeapTuple	tuple;
		Form_pg_index indexForm;
288

289 290 291 292 293 294 295
		/* Check that the user still owns the relation */
		if (!pg_class_ownercheck(rvtc->tableOid, GetUserId()))
		{
			relation_close(OldHeap, AccessExclusiveLock);
			return;
		}

296 297 298 299 300
		/*
		 * Silently skip a temp table for a remote session.  Only doing this
		 * check in the "recheck" case is appropriate (which currently means
		 * somebody is executing a database-wide CLUSTER), because there is
		 * another check in cluster() which will stop any attempt to cluster
Bruce Momjian's avatar
Bruce Momjian committed
301
		 * remote temp tables by name.	There is another check in
302 303 304 305 306 307 308 309 310
		 * check_index_is_clusterable which is redundant, but we leave it for
		 * extra safety.
		 */
		if (isOtherTempNamespace(RelationGetNamespace(OldHeap)))
		{
			relation_close(OldHeap, AccessExclusiveLock);
			return;
		}

311
		/*
312
		 * Check that the index still exists
313 314
		 */
		if (!SearchSysCacheExists(RELOID,
315 316
								  ObjectIdGetDatum(rvtc->indexOid),
								  0, 0, 0))
317 318
		{
			relation_close(OldHeap, AccessExclusiveLock);
319
			return;
320
		}
321

322 323 324
		/*
		 * Check that the index is still the one with indisclustered set.
		 */
325 326 327
		tuple = SearchSysCache(INDEXRELID,
							   ObjectIdGetDatum(rvtc->indexOid),
							   0, 0, 0);
Bruce Momjian's avatar
Bruce Momjian committed
328
		if (!HeapTupleIsValid(tuple))	/* probably can't happen */
329 330 331 332
		{
			relation_close(OldHeap, AccessExclusiveLock);
			return;
		}
333 334 335 336
		indexForm = (Form_pg_index) GETSTRUCT(tuple);
		if (!indexForm->indisclustered)
		{
			ReleaseSysCache(tuple);
337
			relation_close(OldHeap, AccessExclusiveLock);
338 339 340 341 342
			return;
		}
		ReleaseSysCache(tuple);
	}

343
	/* Check index is valid to cluster on */
344
	check_index_is_clusterable(OldHeap, rvtc->indexOid, recheck);
345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360

	/* rebuild_relation does all the dirty work */
	rebuild_relation(OldHeap, rvtc->indexOid);

	/* NB: rebuild_relation does heap_close() on OldHeap */
}

/*
 * Verify that the specified index is a legitimate index to cluster on
 *
 * Side effect: obtains exclusive lock on the index.  The caller should
 * already have exclusive lock on the table, so the index lock is likely
 * redundant, but it seems best to grab it anyway to ensure the index
 * definition can't change under us.
 */
void
361
check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck)
362 363 364
{
	Relation	OldIndex;

365
	OldIndex = index_open(indexOid, AccessExclusiveLock);
366

367
	/*
368
	 * Check that index is in fact an index on the given relation
369
	 */
370
	if (OldIndex->rd_index == NULL ||
371
		OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
372 373 374 375 376
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is not an index for table \"%s\"",
						RelationGetRelationName(OldIndex),
						RelationGetRelationName(OldHeap))));
377

378
	/*
379 380 381 382
	 * Disallow clustering on incomplete indexes (those that might not index
	 * every row of the relation).	We could relax this by making a separate
	 * seqscan pass over the table to copy the missing rows, but that seems
	 * expensive and tedious.
383
	 */
384
	if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred))
385 386
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
387 388
				 errmsg("cannot cluster on partial index \"%s\"",
						RelationGetRelationName(OldIndex))));
389

Bruce Momjian's avatar
Bruce Momjian committed
390
	if (!OldIndex->rd_am->amclusterable)
391 392 393 394 395
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
						RelationGetRelationName(OldIndex))));

396 397 398 399 400
	if (!OldIndex->rd_am->amindexnulls)
	{
		AttrNumber	colno;

		/*
401 402 403 404
		 * If the AM doesn't index nulls, then it's a partial index unless we
		 * can prove all the rows are non-null.  Note we only need look at the
		 * first column; multicolumn-capable AMs are *required* to index nulls
		 * in columns after the first.
405
		 */
406
		colno = OldIndex->rd_index->indkey.values[0];
407 408 409
		if (colno > 0)
		{
			/* ordinary user attribute */
410
			if (!OldHeap->rd_att->attrs[colno - 1]->attnotnull)
411 412
				ereport(ERROR,
						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
Peter Eisentraut's avatar
Peter Eisentraut committed
413
						 errmsg("cannot cluster on index \"%s\" because access method does not handle null values",
414
								RelationGetRelationName(OldIndex)),
Peter Eisentraut's avatar
Peter Eisentraut committed
415
						 recheck
416
						 ? errhint("You might be able to work around this by marking column \"%s\" NOT NULL, or use ALTER TABLE ... SET WITHOUT CLUSTER to remove the cluster specification from the table.",
417
						 NameStr(OldHeap->rd_att->attrs[colno - 1]->attname))
418
						 : errhint("You might be able to work around this by marking column \"%s\" NOT NULL.",
419
					  NameStr(OldHeap->rd_att->attrs[colno - 1]->attname))));
420 421 422 423 424 425 426
		}
		else if (colno < 0)
		{
			/* system column --- okay, always non-null */
		}
		else
			/* index expression, lose... */
427 428
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
Peter Eisentraut's avatar
Peter Eisentraut committed
429
					 errmsg("cannot cluster on expressional index \"%s\" because its index access method does not handle null values",
430
							RelationGetRelationName(OldIndex))));
431 432
	}

433 434 435 436 437 438 439 440 441 442 443 444 445 446
	/*
	 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
	 * it might well not contain entries for every heap row, or might not even
	 * be internally consistent.  (But note that we don't check indcheckxmin;
	 * the worst consequence of following broken HOT chains would be that we
	 * might put recently-dead tuples out-of-order in the new table, and there
	 * is little harm in that.)
	 */
	if (!OldIndex->rd_index->indisvalid)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot cluster on invalid index \"%s\"",
						RelationGetRelationName(OldIndex))));

447
	/*
448 449 450 451 452
	 * Disallow clustering system relations.  This will definitely NOT work
	 * for shared relations (we have no way to update pg_class rows in other
	 * databases), nor for nailed-in-cache relations (the relfilenode values
	 * for those are hardwired, see relcache.c).  It might work for other
	 * system relations, but I ain't gonna risk it.
453 454
	 */
	if (IsSystemRelation(OldHeap))
455 456 457 458
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("\"%s\" is a system catalog",
						RelationGetRelationName(OldHeap))));
459

460
	/*
461 462
	 * Don't allow cluster on temp tables of other backends ... their local
	 * buffer manager is not going to cope.
463 464
	 */
	if (isOtherTempNamespace(RelationGetNamespace(OldHeap)))
465 466
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
467
			   errmsg("cannot cluster temporary tables of other sessions")));
468

469
	/*
470 471
	 * Also check for active uses of the relation in the current transaction,
	 * including open scans and pending AFTER trigger events.
472
	 */
473
	CheckTableNotInUse(OldHeap, "CLUSTER");
474

475
	/* Drop relcache refcnt on OldIndex, but keep lock */
476
	index_close(OldIndex, NoLock);
Bruce Momjian's avatar
Bruce Momjian committed
477 478
}

479
/*
 * mark_index_clustered: mark the specified index as the one clustered on
 *
 * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
 *
 * rel: the table whose indexes are to be updated.
 * indexOid: the index that should end up with indisclustered set, or
 *		InvalidOid to clear the flag everywhere.
 *
 * Each pg_index row whose flag changes is updated in place and a relcache
 * invalidation is queued for that index.
 */
void
mark_index_clustered(Relation rel, Oid indexOid)
{
	HeapTuple	indexTuple;
	Form_pg_index indexForm;
	Relation	pg_index;
	ListCell   *index;

	/*
	 * If the index is already marked clustered, no need to do anything.
	 */
	if (OidIsValid(indexOid))
	{
		indexTuple = SearchSysCache(INDEXRELID,
									ObjectIdGetDatum(indexOid),
									0, 0, 0);
		if (!HeapTupleIsValid(indexTuple))
			elog(ERROR, "cache lookup failed for index %u", indexOid);
		indexForm = (Form_pg_index) GETSTRUCT(indexTuple);

		if (indexForm->indisclustered)
		{
			ReleaseSysCache(indexTuple);
			return;
		}

		ReleaseSysCache(indexTuple);
	}

	/*
	 * Check each index of the relation and set/clear the bit as needed.
	 */
	pg_index = heap_open(IndexRelationId, RowExclusiveLock);

	foreach(index, RelationGetIndexList(rel))
	{
		Oid			thisIndexOid = lfirst_oid(index);

		/* work on a modifiable copy of the pg_index row */
		indexTuple = SearchSysCacheCopy(INDEXRELID,
										ObjectIdGetDatum(thisIndexOid),
										0, 0, 0);
		if (!HeapTupleIsValid(indexTuple))
			elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
		indexForm = (Form_pg_index) GETSTRUCT(indexTuple);

		/*
		 * Unset the bit if set.  We know it's wrong because we checked this
		 * earlier.
		 */
		if (indexForm->indisclustered)
		{
			indexForm->indisclustered = false;
			simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
			CatalogUpdateIndexes(pg_index, indexTuple);
			/* Ensure we see the update in the index's relcache entry */
			CacheInvalidateRelcacheByRelid(thisIndexOid);
		}
		else if (thisIndexOid == indexOid)
		{
			indexForm->indisclustered = true;
			simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
			CatalogUpdateIndexes(pg_index, indexTuple);
			/* Ensure we see the update in the index's relcache entry */
			CacheInvalidateRelcacheByRelid(thisIndexOid);
		}
		heap_freetuple(indexTuple);
	}

	heap_close(pg_index, RowExclusiveLock);
}

/*
 * rebuild_relation: rebuild an existing relation in index order
557 558
 *
 * OldHeap: table to rebuild --- must be opened and exclusive-locked!
559
 * indexOid: index to cluster by
560 561 562
 *
 * NB: this routine closes OldHeap at the right time; caller should not.
 */
563
static void
564
rebuild_relation(Relation OldHeap, Oid indexOid)
Bruce Momjian's avatar
Bruce Momjian committed
565
{
566
	Oid			tableOid = RelationGetRelid(OldHeap);
567
	Oid			tableSpace = OldHeap->rd_rel->reltablespace;
Bruce Momjian's avatar
Bruce Momjian committed
568 569
	Oid			OIDNewHeap;
	char		NewHeapName[NAMEDATALEN];
570
	TransactionId frozenXid;
Bruce Momjian's avatar
Bruce Momjian committed
571
	ObjectAddress object;
572
	Relation	newrel;
Bruce Momjian's avatar
Bruce Momjian committed
573

574 575
	/* Mark the correct index as clustered */
	mark_index_clustered(OldHeap, indexOid);
576 577 578 579

	/* Close relcache entry, but keep lock until transaction commit */
	heap_close(OldHeap, NoLock);

580
	/*
581 582 583 584 585
	 * Create the new heap, using a temporary name in the same namespace as
	 * the existing table.	NOTE: there is some risk of collision with user
	 * relnames.  Working around this seems more trouble than it's worth; in
	 * particular, we can't create the new heap in a different namespace from
	 * the old, or we will have problems with the TEMP status of temp tables.
586
	 */
587
	snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", tableOid);
588

589
	OIDNewHeap = make_new_heap(tableOid, NewHeapName, tableSpace);
Bruce Momjian's avatar
Bruce Momjian committed
590

Bruce Momjian's avatar
Bruce Momjian committed
591
	/*
592
	 * We don't need CommandCounterIncrement() because make_new_heap did it.
Bruce Momjian's avatar
Bruce Momjian committed
593
	 */
594

595 596 597
	/*
	 * Copy the heap data into the new table in the desired order.
	 */
598
	frozenXid = copy_heap_data(OIDNewHeap, tableOid, indexOid);
599

600
	/* To make the new heap's data visible (probably not needed?). */
601 602
	CommandCounterIncrement();

603
	/* Swap the physical files of the old and new heaps. */
604
	swap_relation_files(tableOid, OIDNewHeap, frozenXid);
605 606

	CommandCounterIncrement();
607

608
	/* Destroy new heap with old filenode */
609
	object.classId = RelationRelationId;
610
	object.objectId = OIDNewHeap;
611
	object.objectSubId = 0;
612

613 614
	/*
	 * The new relation is local to our transaction and we know nothing
615 616
	 * depends on it, so DROP_RESTRICT should be OK.
	 */
617 618 619
	performDeletion(&object, DROP_RESTRICT);

	/* performDeletion does CommandCounterIncrement at end */
Bruce Momjian's avatar
Bruce Momjian committed
620

621
	/*
622 623 624
	 * Rebuild each index on the relation (but not the toast table, which is
	 * all-new at this point).	We do not need CommandCounterIncrement()
	 * because reindex_relation does it.
625
	 */
626
	reindex_relation(tableOid, false);
627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655

	/*
	 * At this point, everything is kosher except that the toast table's name
	 * corresponds to the temporary table.  The name is irrelevant to
	 * the backend because it's referenced by OID, but users looking at the
	 * catalogs could be confused.  Rename it to prevent this problem.
	 *
	 * Note no lock required on the relation, because we already hold an
	 * exclusive lock on it.
	 */
	newrel = heap_open(tableOid, NoLock);
	if (OidIsValid(newrel->rd_rel->reltoastrelid))
	{
		char		NewToastName[NAMEDATALEN];
		Relation	toastrel;

		/* rename the toast table ... */
		snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u", tableOid);
		RenameRelationInternal(newrel->rd_rel->reltoastrelid, NewToastName,
							   PG_TOAST_NAMESPACE);

		/* ... and its index too */
		toastrel = relation_open(newrel->rd_rel->reltoastrelid, AccessShareLock);
		snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index", tableOid);
		RenameRelationInternal(toastrel->rd_rel->reltoastidxid, NewToastName,
							   PG_TOAST_NAMESPACE);
		relation_close(toastrel, AccessShareLock);
	}
	relation_close(newrel, NoLock);
656 657
}

658 659 660
/*
 * Create the new table that we will fill with correctly-ordered data.
 */
661
Oid
662
make_new_heap(Oid OIDOldHeap, const char *NewName, Oid NewTableSpace)
663
{
664 665 666
	TupleDesc	OldHeapDesc,
				tupdesc;
	Oid			OIDNewHeap;
667
	Relation	OldHeap;
668
	HeapTuple	tuple;
669 670
	Datum		reloptions;
	bool		isNull;
671

672
	OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
673
	OldHeapDesc = RelationGetDescr(OldHeap);
674 675

	/*
676
	 * Need to make a copy of the tuple descriptor, since
677 678 679 680
	 * heap_create_with_catalog modifies it.  Note that the NewHeap will
	 * not receive any of the defaults or constraints associated with the
	 * OldHeap; we don't need 'em, and there's no reason to spend cycles
	 * inserting them into the catalogs only to delete them.
681
	 */
682
	tupdesc = CreateTupleDescCopy(OldHeapDesc);
683

684 685 686 687 688 689
	/*
	 * Use options of the old heap for new heap.
	 */
	tuple = SearchSysCache(RELOID,
						   ObjectIdGetDatum(OIDOldHeap),
						   0, 0, 0);
690 691 692 693 694 695
	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
	reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
								 &isNull);
	if (isNull)
		reloptions = (Datum) 0;
696

697 698
	OIDNewHeap = heap_create_with_catalog(NewName,
										  RelationGetNamespace(OldHeap),
Bruce Momjian's avatar
Bruce Momjian committed
699
										  NewTableSpace,
700
										  InvalidOid,
701
										  OldHeap->rd_rel->relowner,
702
										  tupdesc,
703
										  NIL,
704
										  OldHeap->rd_rel->relkind,
705
										  OldHeap->rd_rel->relisshared,
706 707
										  true,
										  0,
708
										  ONCOMMIT_NOOP,
709 710
										  reloptions,
										  allowSystemTableMods);
711 712

	ReleaseSysCache(tuple);
713

714
	/*
715 716
	 * Advance command counter so that the newly-created relation's catalog
	 * tuples will be visible to heap_open.
717 718 719 720
	 */
	CommandCounterIncrement();

	/*
721
	 * If necessary, create a TOAST table for the new relation. Note that
722 723
	 * AlterTableCreateToastTable ends with CommandCounterIncrement(), so that
	 * the TOAST table will be visible for insertion.
724
	 */
725
	AlterTableCreateToastTable(OIDNewHeap);
726

727
	heap_close(OldHeap, NoLock);
728

729
	return OIDNewHeap;
730 731
}

732
/*
733 734
 * Do the physical copying of heap data.  Returns the TransactionId used as
 * freeze cutoff point for the tuples.
735
 */
736
static TransactionId
737
copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
738
{
739 740 741
	Relation	NewHeap,
				OldHeap,
				OldIndex;
742 743 744 745
	TupleDesc	oldTupDesc;
	TupleDesc	newTupDesc;
	int			natts;
	Datum	   *values;
746
	bool	   *isnull;
747 748
	IndexScanDesc scan;
	HeapTuple	tuple;
749
	bool		use_wal;
750
	TransactionId OldestXmin;
751
	TransactionId FreezeXid;
752
	RewriteState rwstate;
753 754

	/*
755
	 * Open the relations we need.
756
	 */
757 758
	NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
	OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
759
	OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
760

761
	/*
762 763
	 * Their tuple descriptors should be exactly alike, but here we only need
	 * assume that they have the same number of columns.
764 765 766 767 768
	 */
	oldTupDesc = RelationGetDescr(OldHeap);
	newTupDesc = RelationGetDescr(NewHeap);
	Assert(newTupDesc->natts == oldTupDesc->natts);

769
	/* Preallocate values/isnull arrays */
770
	natts = newTupDesc->natts;
771 772
	values = (Datum *) palloc(natts * sizeof(Datum));
	isnull = (bool *) palloc(natts * sizeof(bool));
773

774 775
	/*
	 * We need to log the copied data in WAL iff WAL archiving is enabled AND
776
	 * it's not a temp rel.
777 778 779 780 781 782
	 */
	use_wal = XLogArchivingActive() && !NewHeap->rd_istemp;

	/* use_wal off requires rd_targblock be initially invalid */
	Assert(NewHeap->rd_targblock == InvalidBlockNumber);

783 784
	/*
	 * compute xids used to freeze and weed out dead tuples.  We use -1
Bruce Momjian's avatar
Bruce Momjian committed
785 786
	 * freeze_min_age to avoid having CLUSTER freeze tuples earlier than a
	 * plain VACUUM would.
787 788 789
	 */
	vacuum_set_xid_limits(-1, OldHeap->rd_rel->relisshared,
						  &OldestXmin, &FreezeXid);
790

791 792 793 794 795 796 797
	/*
	 * FreezeXid will become the table's new relfrozenxid, and that mustn't
	 * go backwards, so take the max.
	 */
	if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
		FreezeXid = OldHeap->rd_rel->relfrozenxid;

798
	/* Initialize the rewrite operation */
799
	rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal);
800

801
	/*
802 803
	 * Scan through the OldHeap in OldIndex order and copy each tuple into the
	 * NewHeap.  To ensure we see recently-dead tuples that still need to be
Bruce Momjian's avatar
Bruce Momjian committed
804 805
	 * copied, we scan with SnapshotAny and use HeapTupleSatisfiesVacuum for
	 * the visibility test.
806
	 */
807
	scan = index_beginscan(OldHeap, OldIndex,
808
						   SnapshotAny, 0, (ScanKey) NULL);
809

810
	while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
811
	{
812 813 814 815 816 817
		HeapTuple	copiedTuple;
		bool		isdead;
		int			i;

		CHECK_FOR_INTERRUPTS();

818 819 820 821
		/* Since we used no scan keys, should never need to recheck */
		if (scan->xs_recheck)
			elog(ERROR, "CLUSTER does not support lossy index conditions");

822 823 824 825 826 827 828 829 830 831 832 833 834 835 836
		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);

		switch (HeapTupleSatisfiesVacuum(tuple->t_data, OldestXmin,
										 scan->xs_cbuf))
		{
			case HEAPTUPLE_DEAD:
				/* Definitely dead */
				isdead = true;
				break;
			case HEAPTUPLE_LIVE:
			case HEAPTUPLE_RECENTLY_DEAD:
				/* Live or recently dead, must copy it */
				isdead = false;
				break;
			case HEAPTUPLE_INSERT_IN_PROGRESS:
Bruce Momjian's avatar
Bruce Momjian committed
837

838
				/*
Bruce Momjian's avatar
Bruce Momjian committed
839 840
				 * We should not see this unless it's been inserted earlier in
				 * our own transaction.
841 842
				 */
				if (!TransactionIdIsCurrentTransactionId(
Bruce Momjian's avatar
Bruce Momjian committed
843
									  HeapTupleHeaderGetXmin(tuple->t_data)))
844 845 846 847 848
					elog(ERROR, "concurrent insert in progress");
				/* treat as live */
				isdead = false;
				break;
			case HEAPTUPLE_DELETE_IN_PROGRESS:
Bruce Momjian's avatar
Bruce Momjian committed
849

850
				/*
Bruce Momjian's avatar
Bruce Momjian committed
851 852
				 * We should not see this unless it's been deleted earlier in
				 * our own transaction.
853 854 855
				 */
				Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
				if (!TransactionIdIsCurrentTransactionId(
Bruce Momjian's avatar
Bruce Momjian committed
856
									  HeapTupleHeaderGetXmax(tuple->t_data)))
857 858 859 860 861 862
					elog(ERROR, "concurrent delete in progress");
				/* treat as recently dead */
				isdead = false;
				break;
			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
Bruce Momjian's avatar
Bruce Momjian committed
863
				isdead = false; /* keep compiler quiet */
864 865 866 867 868 869 870 871 872 873 874 875
				break;
		}

		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

		if (isdead)
		{
			/* heap rewrite module still needs to see it... */
			rewrite_heap_dead_tuple(rwstate, tuple);
			continue;
		}

876
		/*
877
		 * We cannot simply copy the tuple as-is, for several reasons:
878
		 *
879
		 * 1. We'd like to squeeze out the values of any dropped columns, both
880
		 * to save space and to ensure we have no corner-case failures. (It's
881 882
		 * possible for example that the new table hasn't got a TOAST table
		 * and so is unable to store any large values of dropped cols.)
883
		 *
884
		 * 2. The tuple might not even be legal for the new table; this is
885 886 887 888
		 * currently only known to happen as an after-effect of ALTER TABLE
		 * SET WITHOUT OIDS.
		 *
		 * So, we must reconstruct the tuple from component Datums.
889
		 */
890
		heap_deform_tuple(tuple, oldTupDesc, values, isnull);
891 892 893 894 895

		/* Be sure to null out any dropped columns */
		for (i = 0; i < natts; i++)
		{
			if (newTupDesc->attrs[i]->attisdropped)
896
				isnull[i] = true;
897 898
		}

899
		copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
900 901 902 903

		/* Preserve OID, if any */
		if (NewHeap->rd_rel->relhasoids)
			HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
904

905 906
		/* The heap rewrite module does the rest */
		rewrite_heap_tuple(rwstate, tuple, copiedTuple);
907 908

		heap_freetuple(copiedTuple);
909
	}
910

911
	index_endscan(scan);
912

913 914
	/* Write out any remaining tuples, and fsync if needed */
	end_heap_rewrite(rwstate);
915

916 917
	pfree(values);
	pfree(isnull);
918

919
	index_close(OldIndex, NoLock);
920 921
	heap_close(OldHeap, NoLock);
	heap_close(NewHeap, NoLock);
922 923

	return FreezeXid;
924
}
925

926
/*
927 928 929 930
 * Swap the physical files of two given relations.
 *
 * We swap the physical identity (reltablespace and relfilenode) while
 * keeping the same logical identities of the two relations.
931 932 933
 *
 * Also swap any TOAST links, so that the toast data moves along with
 * the main-table data.
934 935 936 937 938 939 940
 *
 * Additionally, the first relation is marked with relfrozenxid set to
 * frozenXid.  It seems a bit ugly to have this here, but all callers would
 * have to do it anyway, so having it here saves a heap_update.  Note: the
 * TOAST table needs no special handling, because since we swapped the links,
 * the entry for the TOAST table will now contain RecentXmin in relfrozenxid,
 * which is the correct value.
941
 */
942
void
943
swap_relation_files(Oid r1, Oid r2, TransactionId frozenXid)
944
{
945
	Relation	relRelation;
946 947 948 949 950
	HeapTuple	reltup1,
				reltup2;
	Form_pg_class relform1,
				relform2;
	Oid			swaptemp;
951
	CatalogIndexState indstate;
952

953
	/* We need writable copies of both pg_class tuples. */
954
	relRelation = heap_open(RelationRelationId, RowExclusiveLock);
955

956 957 958 959
	reltup1 = SearchSysCacheCopy(RELOID,
								 ObjectIdGetDatum(r1),
								 0, 0, 0);
	if (!HeapTupleIsValid(reltup1))
960
		elog(ERROR, "cache lookup failed for relation %u", r1);
961 962 963 964 965 966
	relform1 = (Form_pg_class) GETSTRUCT(reltup1);

	reltup2 = SearchSysCacheCopy(RELOID,
								 ObjectIdGetDatum(r2),
								 0, 0, 0);
	if (!HeapTupleIsValid(reltup2))
967
		elog(ERROR, "cache lookup failed for relation %u", r2);
968
	relform2 = (Form_pg_class) GETSTRUCT(reltup2);
969

970
	/*
971
	 * Actually swap the fields in the two tuples
972 973 974 975
	 */
	swaptemp = relform1->relfilenode;
	relform1->relfilenode = relform2->relfilenode;
	relform2->relfilenode = swaptemp;
976

977 978 979 980
	swaptemp = relform1->reltablespace;
	relform1->reltablespace = relform2->reltablespace;
	relform2->reltablespace = swaptemp;

981 982 983
	swaptemp = relform1->reltoastrelid;
	relform1->reltoastrelid = relform2->reltoastrelid;
	relform2->reltoastrelid = swaptemp;
984

985
	/* we should not swap reltoastidxid */
986

987 988 989 990
	/* set rel1's frozen Xid */
	Assert(TransactionIdIsNormal(frozenXid));
	relform1->relfrozenxid = frozenXid;

991 992
	/* swap size statistics too, since new rel has freshly-updated stats */
	{
Bruce Momjian's avatar
Bruce Momjian committed
993 994
		int4		swap_pages;
		float4		swap_tuples;
995 996 997 998 999 1000 1001 1002 1003 1004

		swap_pages = relform1->relpages;
		relform1->relpages = relform2->relpages;
		relform2->relpages = swap_pages;

		swap_tuples = relform1->reltuples;
		relform1->reltuples = relform2->reltuples;
		relform2->reltuples = swap_tuples;
	}

1005 1006 1007
	/* Update the tuples in pg_class */
	simple_heap_update(relRelation, &reltup1->t_self, reltup1);
	simple_heap_update(relRelation, &reltup2->t_self, reltup2);
Bruce Momjian's avatar
Bruce Momjian committed
1008

1009
	/* Keep system catalogs current */
1010
	indstate = CatalogOpenIndexes(relRelation);
1011 1012
	CatalogIndexInsert(indstate, reltup1);
	CatalogIndexInsert(indstate, reltup2);
1013
	CatalogCloseIndexes(indstate);
1014

1015
	/*
1016 1017 1018
	 * If we have toast tables associated with the relations being swapped,
	 * change their dependency links to re-associate them with their new
	 * owning relations.  Otherwise the wrong one will get dropped ...
1019
	 *
Bruce Momjian's avatar
Bruce Momjian committed
1020
	 * NOTE: it is possible that only one table has a toast table; this can
1021 1022
	 * happen in CLUSTER if there were dropped columns in the old table, and
	 * in ALTER TABLE when adding or changing type of columns.
1023
	 *
1024 1025 1026 1027
	 * NOTE: at present, a TOAST table's only dependency is the one on its
	 * owning table.  If more are ever created, we'd need to use something
	 * more selective than deleteDependencyRecordsFor() to get rid of only the
	 * link we want.
1028 1029 1030 1031 1032 1033 1034 1035
	 */
	if (relform1->reltoastrelid || relform2->reltoastrelid)
	{
		ObjectAddress baseobject,
					toastobject;
		long		count;

		/* Delete old dependencies */
1036 1037
		if (relform1->reltoastrelid)
		{
1038
			count = deleteDependencyRecordsFor(RelationRelationId,
1039 1040 1041 1042 1043 1044 1045
											   relform1->reltoastrelid);
			if (count != 1)
				elog(ERROR, "expected one dependency record for TOAST table, found %ld",
					 count);
		}
		if (relform2->reltoastrelid)
		{
1046
			count = deleteDependencyRecordsFor(RelationRelationId,
1047 1048 1049 1050 1051
											   relform2->reltoastrelid);
			if (count != 1)
				elog(ERROR, "expected one dependency record for TOAST table, found %ld",
					 count);
		}
1052 1053

		/* Register new dependencies */
1054
		baseobject.classId = RelationRelationId;
1055
		baseobject.objectSubId = 0;
1056
		toastobject.classId = RelationRelationId;
1057 1058
		toastobject.objectSubId = 0;

1059 1060 1061 1062 1063 1064
		if (relform1->reltoastrelid)
		{
			baseobject.objectId = r1;
			toastobject.objectId = relform1->reltoastrelid;
			recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
		}
1065

1066 1067 1068 1069 1070 1071
		if (relform2->reltoastrelid)
		{
			baseobject.objectId = r2;
			toastobject.objectId = relform2->reltoastrelid;
			recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
		}
1072 1073 1074
	}

	/*
Bruce Momjian's avatar
Bruce Momjian committed
1075
	 * Blow away the old relcache entries now.	We need this kluge because
1076 1077 1078 1079 1080 1081 1082 1083 1084 1085
	 * relcache.c keeps a link to the smgr relation for the physical file, and
	 * that will be out of date as soon as we do CommandCounterIncrement.
	 * Whichever of the rels is the second to be cleared during cache
	 * invalidation will have a dangling reference to an already-deleted smgr
	 * relation.  Rather than trying to avoid this by ordering operations just
	 * so, it's easiest to not have the relcache entries there at all.
	 * (Fortunately, since one of the entries is local in our transaction,
	 * it's sufficient to clear out our own relcache this way; the problem
	 * cannot arise for other backends when they see our update on the
	 * non-local relation.)
1086 1087 1088 1089 1090 1091 1092 1093 1094
	 */
	RelationForgetRelation(r1);
	RelationForgetRelation(r2);

	/* Clean up. */
	heap_freetuple(reltup1);
	heap_freetuple(reltup2);

	heap_close(relRelation, RowExclusiveLock);
1095
}
1096

1097 1098
/*
 * Get a list of tables that the current user owns and
1099
 * have indisclustered set.  Return the list in a List * of rvsToCluster
1100
 * with the tableOid and the indexOid on which the table is already
1101 1102
 * clustered.
 */
1103 1104
static List *
get_tables_to_cluster(MemoryContext cluster_context)
1105
{
Bruce Momjian's avatar
Bruce Momjian committed
1106 1107 1108 1109 1110 1111 1112 1113
	Relation	indRelation;
	HeapScanDesc scan;
	ScanKeyData entry;
	HeapTuple	indexTuple;
	Form_pg_index index;
	MemoryContext old_context;
	RelToCluster *rvtc;
	List	   *rvs = NIL;
1114 1115

	/*
1116
	 * Get all indexes that have indisclustered set and are owned by
1117 1118 1119
	 * appropriate user. System relations or nailed-in relations cannot ever
	 * have indisclustered set, because CLUSTER will refuse to set it when
	 * called with one of them as argument.
1120
	 */
1121
	indRelation = heap_open(IndexRelationId, AccessShareLock);
1122 1123 1124 1125
	ScanKeyInit(&entry,
				Anum_pg_index_indisclustered,
				BTEqualStrategyNumber, F_BOOLEQ,
				BoolGetDatum(true));
1126 1127 1128 1129
	scan = heap_beginscan(indRelation, SnapshotNow, 1, &entry);
	while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		index = (Form_pg_index) GETSTRUCT(indexTuple);
1130 1131

		if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1132 1133 1134
			continue;

		/*
1135 1136
		 * We have to build the list in a different memory context so it will
		 * survive the cross-transaction processing
1137 1138 1139
		 */
		old_context = MemoryContextSwitchTo(cluster_context);

1140
		rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1141
		rvtc->tableOid = index->indrelid;
1142 1143
		rvtc->indexOid = index->indexrelid;
		rvs = lcons(rvtc, rvs);
1144 1145 1146 1147 1148

		MemoryContextSwitchTo(old_context);
	}
	heap_endscan(scan);

1149 1150
	relation_close(indRelation, AccessShareLock);

1151 1152
	return rvs;
}