/*-------------------------------------------------------------------------
 *
 * heapam.c
 *	  heap access method code
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.244 2007/11/07 12:24:24 petere Exp $
 *
 *
 * INTERFACE ROUTINES
 *		relation_open	- open any relation by relation OID
 *		relation_openrv - open any relation specified by a RangeVar
 *		relation_close	- close any relation
 *		heap_open		- open a heap relation by relation OID
 *		heap_openrv		- open a heap relation specified by a RangeVar
 *		heap_close		- (now just a macro for relation_close)
 *		heap_beginscan	- begin relation scan
 *		heap_rescan		- restart a relation scan
 *		heap_endscan	- end relation scan
 *		heap_getnext	- retrieve next tuple in scan
 *		heap_fetch		- retrieve tuple with given tid
 *		heap_insert		- insert tuple into a relation
 *		heap_delete		- delete a tuple from a relation
 *		heap_update		- replace a tuple in a relation with another tuple
 *		heap_markpos	- mark scan position
 *		heap_restrpos	- restore position to marked location
 *		heap_sync		- sync heap, for when no WAL has been written
 *
 * NOTES
 *	  This file contains the heap_ routines which implement
 *	  the POSTGRES heap access method used for all POSTGRES
 *	  relations.
 *
 *-------------------------------------------------------------------------
 */
40
#include "postgres.h"
41

42 43
#include "access/heapam.h"
#include "access/hio.h"
44
#include "access/multixact.h"
45
#include "access/transam.h"
Tom Lane's avatar
Tom Lane committed
46
#include "access/tuptoaster.h"
Bruce Momjian's avatar
Bruce Momjian committed
47
#include "access/valid.h"
48
#include "access/xact.h"
49
#include "catalog/catalog.h"
50
#include "catalog/namespace.h"
Bruce Momjian's avatar
Bruce Momjian committed
51
#include "miscadmin.h"
52
#include "pgstat.h"
53
#include "storage/procarray.h"
54
#include "storage/smgr.h"
55
#include "utils/datum.h"
Bruce Momjian's avatar
Bruce Momjian committed
56
#include "utils/inval.h"
57
#include "utils/lsyscache.h"
Bruce Momjian's avatar
Bruce Momjian committed
58
#include "utils/relcache.h"
59
#include "utils/syscache.h"
60

61

62 63 64 65
static HeapScanDesc heap_beginscan_internal(Relation relation,
											Snapshot snapshot,
											int nkeys, ScanKey key,
											bool is_bitmapscan);
66
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
67
		   ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move);
68 69
static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
					   HeapTuple oldtup, HeapTuple newtup);
70

Marc G. Fournier's avatar
Marc G. Fournier committed
71

72
/* ----------------------------------------------------------------
73
 *						 heap support routines
74 75 76 77
 * ----------------------------------------------------------------
 */

/* ----------------
78
 *		initscan - scan code common to heap_beginscan and heap_rescan
79 80 81
 * ----------------
 */
static void
82
initscan(HeapScanDesc scan, ScanKey key)
83
{
84
	/*
85 86
	 * Determine the number of blocks we have to scan.
	 *
Bruce Momjian's avatar
Bruce Momjian committed
87
	 * It is sufficient to do this once at scan start, since any tuples added
88 89 90 91 92 93
	 * while the scan is in progress will be invisible to my snapshot
	 * anyway.  (That is not true when using a non-MVCC snapshot.  However,
	 * we couldn't guarantee to return tuples added after scan start anyway,
	 * since they might go into pages we already scanned.  To guarantee
	 * consistent results for a non-MVCC snapshot, the caller must hold some
	 * higher-level lock that ensures the interesting tuple(s) won't change.)
94
	 */
95
	scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
96

97 98
	/*
	 * If the table is large relative to NBuffers, use a bulk-read access
99 100 101 102 103
	 * strategy and enable synchronized scanning (see syncscan.c).  Although
	 * the thresholds for these features could be different, we make them the
	 * same so that there are only two behaviors to tune rather than four.
	 *
	 * During a rescan, don't make a new strategy object if we don't have to.
104
	 */
105 106 107
	if (!scan->rs_bitmapscan &&
		!scan->rs_rd->rd_istemp &&
		scan->rs_nblocks > NBuffers / 4)
108 109 110
	{
		if (scan->rs_strategy == NULL)
			scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
111 112 113

		scan->rs_syncscan = true;
		scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
114 115 116 117 118 119
	}
	else
	{
		if (scan->rs_strategy != NULL)
			FreeAccessStrategy(scan->rs_strategy);
		scan->rs_strategy = NULL;
120 121 122

		scan->rs_syncscan = false;
		scan->rs_startblock = 0;
123 124
	}

125
	scan->rs_inited = false;
126
	scan->rs_ctup.t_data = NULL;
127
	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
128
	scan->rs_cbuf = InvalidBuffer;
129
	scan->rs_cblock = InvalidBlockNumber;
130 131

	/* we don't have a marked position... */
132
	ItemPointerSetInvalid(&(scan->rs_mctid));
133

134 135
	/* page-at-a-time fields are always invalid when not rs_inited */

136 137
	/*
	 * copy the scan key, if appropriate
138
	 */
139
	if (key != NULL)
140
		memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
141

142 143 144 145 146 147
	/*
	 * Currently, we don't have a stats counter for bitmap heap scans
	 * (but the underlying bitmap index scans will be counted).
	 */
	if (!scan->rs_bitmapscan)
		pgstat_count_heap_scan(scan->rs_rd);
148 149
}

150 151
/*
 * heapgetpage - subroutine for heapgettup()
152
 *
153 154 155
 * This routine reads and pins the specified page of the relation.
 * In page-at-a-time mode it performs additional work, namely determining
 * which tuples on the page are visible.
156
 */
157
static void
158
heapgetpage(HeapScanDesc scan, BlockNumber page)
159
{
160 161
	Buffer		buffer;
	Snapshot	snapshot;
Bruce Momjian's avatar
Bruce Momjian committed
162 163
	Page		dp;
	int			lines;
164
	int			ntup;
Bruce Momjian's avatar
Bruce Momjian committed
165
	OffsetNumber lineoff;
166 167 168
	ItemId		lpp;

	Assert(page < scan->rs_nblocks);
169

170 171 172 173 174 175 176 177 178 179 180
	/* release previous scan buffer, if any */
	if (BufferIsValid(scan->rs_cbuf))
	{
		ReleaseBuffer(scan->rs_cbuf);
		scan->rs_cbuf = InvalidBuffer;
	}

	/* read page using selected strategy */
	scan->rs_cbuf = ReadBufferWithStrategy(scan->rs_rd,
										   page,
										   scan->rs_strategy);
181 182 183 184 185 186 187
	scan->rs_cblock = page;

	if (!scan->rs_pageatatime)
		return;

	buffer = scan->rs_cbuf;
	snapshot = scan->rs_snapshot;
188

189 190 191 192 193
	/*
	 * Prune and repair fragmentation for the whole page, if possible.
	 */
	heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);

194
	/*
Bruce Momjian's avatar
Bruce Momjian committed
195 196 197
	 * We must hold share lock on the buffer content while examining tuple
	 * visibility.	Afterwards, however, the tuples we have found to be
	 * visible are guaranteed good as long as we hold the buffer pin.
198
	 */
199
	LockBuffer(buffer, BUFFER_LOCK_SHARE);
200

201 202 203
	dp = (Page) BufferGetPage(buffer);
	lines = PageGetMaxOffsetNumber(dp);
	ntup = 0;
204

205 206 207
	for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
		 lineoff <= lines;
		 lineoff++, lpp++)
208
	{
209
		if (ItemIdIsNormal(lpp))
210 211 212 213 214 215 216 217 218 219 220 221
		{
			HeapTupleData loctup;
			bool		valid;

			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
			loctup.t_len = ItemIdGetLength(lpp);
			ItemPointerSet(&(loctup.t_self), page, lineoff);

			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
			if (valid)
				scan->rs_vistuples[ntup++] = lineoff;
		}
222
	}
223

224
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
225

226 227 228 229 230 231 232 233 234 235 236
	Assert(ntup <= MaxHeapTuplesPerPage);
	scan->rs_ntuples = ntup;
}

/* ----------------
 *		heapgettup - fetch next heap tuple
 *
 *		Initialize the scan if not already done; then advance to the next
 *		tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
 *		or set scan->rs_ctup.t_data = NULL if no more tuples.
 *
237 238
 * dir == NoMovementScanDirection means "re-fetch the tuple indicated
 * by scan->rs_ctup".
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254
 *
 * Note: the reason nkeys/key are passed separately, even though they are
 * kept in the scan descriptor, is that the caller may not want us to check
 * the scankeys.
 *
 * Note: when we fall off the end of the scan in either direction, we
 * reset rs_inited.  This means that a further request with the same
 * scan direction will restart the scan, which is a bit odd, but a
 * request with the opposite scan direction will start a fresh scan
 * in the proper direction.  The latter is required behavior for cursors,
 * while the former case is generally undefined behavior in Postgres
 * so we don't care too much.
 * ----------------
 */
static void
heapgettup(HeapScanDesc scan,
255
		   ScanDirection dir,
256 257 258 259 260
		   int nkeys,
		   ScanKey key)
{
	HeapTuple	tuple = &(scan->rs_ctup);
	Snapshot	snapshot = scan->rs_snapshot;
261
	bool		backward = ScanDirectionIsBackward(dir);
262
	BlockNumber page;
263
	bool		finished;
264 265 266 267 268 269
	Page		dp;
	int			lines;
	OffsetNumber lineoff;
	int			linesleft;
	ItemId		lpp;

270 271
	/*
	 * calculate next starting lineoff, given scan direction
272
	 */
273
	if (ScanDirectionIsForward(dir))
274
	{
275
		if (!scan->rs_inited)
276
		{
277 278 279 280 281 282 283 284 285
			/*
			 * return null immediately if relation is empty
			 */
			if (scan->rs_nblocks == 0)
			{
				Assert(!BufferIsValid(scan->rs_cbuf));
				tuple->t_data = NULL;
				return;
			}
286
			page = scan->rs_startblock;			/* first page */
287 288 289 290 291 292 293
			heapgetpage(scan, page);
			lineoff = FirstOffsetNumber;		/* first offnum */
			scan->rs_inited = true;
		}
		else
		{
			/* continue from previously returned page/tuple */
Bruce Momjian's avatar
Bruce Momjian committed
294 295
			page = scan->rs_cblock;		/* current page */
			lineoff =			/* next offnum */
296
				OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
297
		}
298

299
		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
300

301
		dp = (Page) BufferGetPage(scan->rs_cbuf);
302 303
		lines = PageGetMaxOffsetNumber(dp);
		/* page and lineoff now reference the physically next tid */
304

305
		linesleft = lines - lineoff + 1;
306
	}
307
	else if (backward)
308
	{
309
		if (!scan->rs_inited)
310 311 312 313 314 315 316 317 318 319
		{
			/*
			 * return null immediately if relation is empty
			 */
			if (scan->rs_nblocks == 0)
			{
				Assert(!BufferIsValid(scan->rs_cbuf));
				tuple->t_data = NULL;
				return;
			}
320 321 322 323 324 325 326
			/*
			 * Disable reporting to syncscan logic in a backwards scan; it's
			 * not very likely anyone else is doing the same thing at the same
			 * time, and much more likely that we'll just bollix things for
			 * forward scanners.
			 */
			scan->rs_syncscan = false;
327
			/* start from last page of the scan */
328 329 330 331
			if (scan->rs_startblock > 0)
				page = scan->rs_startblock - 1;
			else
				page = scan->rs_nblocks - 1;
332
			heapgetpage(scan, page);
333 334 335 336
		}
		else
		{
			/* continue from previously returned page/tuple */
Bruce Momjian's avatar
Bruce Momjian committed
337
			page = scan->rs_cblock;		/* current page */
338
		}
339

340
		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
341

342
		dp = (Page) BufferGetPage(scan->rs_cbuf);
343
		lines = PageGetMaxOffsetNumber(dp);
344 345

		if (!scan->rs_inited)
346
		{
Bruce Momjian's avatar
Bruce Momjian committed
347
			lineoff = lines;	/* final offnum */
348
			scan->rs_inited = true;
349
		}
350 351
		else
		{
Bruce Momjian's avatar
Bruce Momjian committed
352
			lineoff =			/* previous offnum */
353
				OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
354 355
		}
		/* page and lineoff now reference the physically previous tid */
356 357

		linesleft = lineoff;
358 359 360
	}
	else
	{
361
		/*
362
		 * ``no movement'' scan direction: refetch prior tuple
363
		 */
364
		if (!scan->rs_inited)
365
		{
366 367 368
			Assert(!BufferIsValid(scan->rs_cbuf));
			tuple->t_data = NULL;
			return;
369 370
		}

371
		page = ItemPointerGetBlockNumber(&(tuple->t_self));
372 373
		if (page != scan->rs_cblock)
			heapgetpage(scan, page);
374

375
		/* Since the tuple was previously fetched, needn't lock page here */
376
		dp = (Page) BufferGetPage(scan->rs_cbuf);
377 378
		lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
		lpp = PageGetItemId(dp, lineoff);
379
		Assert(ItemIdIsNormal(lpp));
380

381 382
		tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
		tuple->t_len = ItemIdGetLength(lpp);
383

384 385
		return;
	}
386

387
	/*
388 389
	 * advance the scan until we find a qualifying tuple or run out of stuff
	 * to scan
390
	 */
391
	lpp = PageGetItemId(dp, lineoff);
392 393
	for (;;)
	{
394
		while (linesleft > 0)
395
		{
396
			if (ItemIdIsNormal(lpp))
397
			{
Bruce Momjian's avatar
Bruce Momjian committed
398
				bool		valid;
399

400 401 402
				tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
				tuple->t_len = ItemIdGetLength(lpp);
				ItemPointerSet(&(tuple->t_self), page, lineoff);
403 404 405

				/*
				 * if current tuple qualifies, return it.
406
				 */
407 408 409 410 411 412 413 414
				valid = HeapTupleSatisfiesVisibility(tuple,
													 snapshot,
													 scan->rs_cbuf);

				if (valid && key != NULL)
					HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
								nkeys, key, valid);

415
				if (valid)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
416
				{
417
					LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
418
					return;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
419
				}
420 421
			}

422 423
			/*
			 * otherwise move to the next item on the page
424 425
			 */
			--linesleft;
426
			if (backward)
427 428
			{
				--lpp;			/* move back in this page's ItemId array */
429
				--lineoff;
430 431 432
			}
			else
			{
433
				++lpp;			/* move forward in this page's ItemId array */
434
				++lineoff;
435 436 437
			}
		}

438
		/*
439 440
		 * if we get here, it means we've exhausted the items on this page and
		 * it's time to move to the next.
441
		 */
442
		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
443

444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
		/*
		 * advance to next/prior page and detect end of scan
		 */
		if (backward)
		{
			finished = (page == scan->rs_startblock);
			if (page == 0)
				page = scan->rs_nblocks;
			page--;
		}
		else
		{
			page++;
			if (page >= scan->rs_nblocks)
				page = 0;
			finished = (page == scan->rs_startblock);

			/*
			 * Report our new scan position for synchronization purposes.
			 * We don't do that when moving backwards, however. That would
			 * just mess up any other forward-moving scanners.
			 *
			 * Note: we do this before checking for end of scan so that the
			 * final state of the position hint is back at the start of the
			 * rel.  That's not strictly necessary, but otherwise when you run
			 * the same query multiple times the starting position would shift
			 * a little bit backwards on every invocation, which is confusing.
			 * We don't guarantee any specific ordering in general, though.
			 */
			if (scan->rs_syncscan)
				ss_report_location(scan->rs_rd, page);
		}

477
		/*
478
		 * return NULL if we've exhausted all the pages
479
		 */
480
		if (finished)
481
		{
482 483 484 485
			if (BufferIsValid(scan->rs_cbuf))
				ReleaseBuffer(scan->rs_cbuf);
			scan->rs_cbuf = InvalidBuffer;
			scan->rs_cblock = InvalidBlockNumber;
486
			tuple->t_data = NULL;
487
			scan->rs_inited = false;
488
			return;
489 490
		}

491
		heapgetpage(scan, page);
492

493
		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
494

495
		dp = (Page) BufferGetPage(scan->rs_cbuf);
496
		lines = PageGetMaxOffsetNumber((Page) dp);
497
		linesleft = lines;
498
		if (backward)
499 500 501 502
		{
			lineoff = lines;
			lpp = PageGetItemId(dp, lines);
		}
503
		else
504 505
		{
			lineoff = FirstOffsetNumber;
506
			lpp = PageGetItemId(dp, FirstOffsetNumber);
507
		}
508 509 510
	}
}

511 512 513 514 515 516 517 518 519 520 521 522 523 524 525
/* ----------------
 *		heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
 *
 *		Same API as heapgettup, but used in page-at-a-time mode
 *
 * The internal logic is much the same as heapgettup's too, but there are some
 * differences: we do not take the buffer content lock (that only needs to
 * happen inside heapgetpage), and we iterate through just the tuples listed
 * in rs_vistuples[] rather than all tuples on the page.  Notice that
 * lineindex is 0-based, where the corresponding loop variable lineoff in
 * heapgettup is 1-based.
 * ----------------
 */
static void
heapgettup_pagemode(HeapScanDesc scan,
526
					ScanDirection dir,
527 528 529 530
					int nkeys,
					ScanKey key)
{
	HeapTuple	tuple = &(scan->rs_ctup);
531
	bool		backward = ScanDirectionIsBackward(dir);
532
	BlockNumber page;
533
	bool		finished;
534 535 536 537 538 539 540 541 542 543
	Page		dp;
	int			lines;
	int			lineindex;
	OffsetNumber lineoff;
	int			linesleft;
	ItemId		lpp;

	/*
	 * calculate next starting lineindex, given scan direction
	 */
544
	if (ScanDirectionIsForward(dir))
545 546 547
	{
		if (!scan->rs_inited)
		{
548 549 550 551 552 553 554 555 556
			/*
			 * return null immediately if relation is empty
			 */
			if (scan->rs_nblocks == 0)
			{
				Assert(!BufferIsValid(scan->rs_cbuf));
				tuple->t_data = NULL;
				return;
			}
557
			page = scan->rs_startblock;			/* first page */
558
			heapgetpage(scan, page);
559 560 561 562 563 564
			lineindex = 0;
			scan->rs_inited = true;
		}
		else
		{
			/* continue from previously returned page/tuple */
Bruce Momjian's avatar
Bruce Momjian committed
565
			page = scan->rs_cblock;		/* current page */
566 567
			lineindex = scan->rs_cindex + 1;
		}
568 569

		dp = (Page) BufferGetPage(scan->rs_cbuf);
570 571
		lines = scan->rs_ntuples;
		/* page and lineindex now reference the next visible tid */
572

573
		linesleft = lines - lineindex;
574
	}
575
	else if (backward)
576 577
	{
		if (!scan->rs_inited)
578 579 580 581 582 583 584 585 586 587
		{
			/*
			 * return null immediately if relation is empty
			 */
			if (scan->rs_nblocks == 0)
			{
				Assert(!BufferIsValid(scan->rs_cbuf));
				tuple->t_data = NULL;
				return;
			}
588 589 590 591 592 593 594
			/*
			 * Disable reporting to syncscan logic in a backwards scan; it's
			 * not very likely anyone else is doing the same thing at the same
			 * time, and much more likely that we'll just bollix things for
			 * forward scanners.
			 */
			scan->rs_syncscan = false;
595
			/* start from last page of the scan */
596 597 598 599
			if (scan->rs_startblock > 0)
				page = scan->rs_startblock - 1;
			else
				page = scan->rs_nblocks - 1;
600
			heapgetpage(scan, page);
601 602 603 604
		}
		else
		{
			/* continue from previously returned page/tuple */
Bruce Momjian's avatar
Bruce Momjian committed
605
			page = scan->rs_cblock;		/* current page */
606
		}
607 608 609 610 611 612 613 614 615 616 617 618 619 620

		dp = (Page) BufferGetPage(scan->rs_cbuf);
		lines = scan->rs_ntuples;

		if (!scan->rs_inited)
		{
			lineindex = lines - 1;
			scan->rs_inited = true;
		}
		else
		{
			lineindex = scan->rs_cindex - 1;
		}
		/* page and lineindex now reference the previous visible tid */
621 622

		linesleft = lineindex + 1;
623 624 625 626
	}
	else
	{
		/*
627
		 * ``no movement'' scan direction: refetch prior tuple
628 629 630
		 */
		if (!scan->rs_inited)
		{
631 632 633
			Assert(!BufferIsValid(scan->rs_cbuf));
			tuple->t_data = NULL;
			return;
634 635
		}

636
		page = ItemPointerGetBlockNumber(&(tuple->t_self));
637 638 639
		if (page != scan->rs_cblock)
			heapgetpage(scan, page);

640
		/* Since the tuple was previously fetched, needn't lock page here */
641
		dp = (Page) BufferGetPage(scan->rs_cbuf);
642 643
		lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
		lpp = PageGetItemId(dp, lineoff);
644
		Assert(ItemIdIsNormal(lpp));
645

646 647
		tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
		tuple->t_len = ItemIdGetLength(lpp);
648

649 650 651 652 653 654
		/* check that rs_cindex is in sync */
		Assert(scan->rs_cindex < scan->rs_ntuples);
		Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);

		return;
	}
655 656 657 658 659 660 661 662 663 664 665

	/*
	 * advance the scan until we find a qualifying tuple or run out of stuff
	 * to scan
	 */
	for (;;)
	{
		while (linesleft > 0)
		{
			lineoff = scan->rs_vistuples[lineindex];
			lpp = PageGetItemId(dp, lineoff);
666
			Assert(ItemIdIsNormal(lpp));
667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696

			tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
			tuple->t_len = ItemIdGetLength(lpp);
			ItemPointerSet(&(tuple->t_self), page, lineoff);

			/*
			 * if current tuple qualifies, return it.
			 */
			if (key != NULL)
			{
				bool		valid;

				HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
							nkeys, key, valid);
				if (valid)
				{
					scan->rs_cindex = lineindex;
					return;
				}
			}
			else
			{
				scan->rs_cindex = lineindex;
				return;
			}

			/*
			 * otherwise move to the next item on the page
			 */
			--linesleft;
697
			if (backward)
698 699 700 701 702 703 704 705 706
				--lineindex;
			else
				++lineindex;
		}

		/*
		 * if we get here, it means we've exhausted the items on this page and
		 * it's time to move to the next.
		 */
707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735
		if (backward)
		{
			finished = (page == scan->rs_startblock);
			if (page == 0)
				page = scan->rs_nblocks;
			page--;
		}
		else
		{
			page++;
			if (page >= scan->rs_nblocks)
				page = 0;
			finished = (page == scan->rs_startblock);

			/*
			 * Report our new scan position for synchronization purposes.
			 * We don't do that when moving backwards, however. That would
			 * just mess up any other forward-moving scanners.
			 *
			 * Note: we do this before checking for end of scan so that the
			 * final state of the position hint is back at the start of the
			 * rel.  That's not strictly necessary, but otherwise when you run
			 * the same query multiple times the starting position would shift
			 * a little bit backwards on every invocation, which is confusing.
			 * We don't guarantee any specific ordering in general, though.
			 */
			if (scan->rs_syncscan)
				ss_report_location(scan->rs_rd, page);
		}
736 737 738 739

		/*
		 * return NULL if we've exhausted all the pages
		 */
740
		if (finished)
741 742 743 744 745 746 747 748 749 750 751 752 753 754 755
		{
			if (BufferIsValid(scan->rs_cbuf))
				ReleaseBuffer(scan->rs_cbuf);
			scan->rs_cbuf = InvalidBuffer;
			scan->rs_cblock = InvalidBlockNumber;
			tuple->t_data = NULL;
			scan->rs_inited = false;
			return;
		}

		heapgetpage(scan, page);

		dp = (Page) BufferGetPage(scan->rs_cbuf);
		lines = scan->rs_ntuples;
		linesleft = lines;
756
		if (backward)
757 758 759 760 761 762
			lineindex = lines - 1;
		else
			lineindex = 0;
	}
}

763

764 765 766 767 768 769 770 771 772 773 774 775 776 777 778
#if defined(DISABLE_COMPLEX_MACRO)
/*
 * fastgetattr - function form of the fastgetattr macro
 *
 * Only compiled when DISABLE_COMPLEX_MACRO is defined.  The logic has to stay
 * in step with the macro definition in access/heapam.h.
 *
 * For attnum <= 0 the result is (Datum) NULL and *isnull is left alone.
 * Otherwise *isnull (when isnull is non-NULL) is set to report whether the
 * attribute is null, and the attribute value is returned, or (Datum) NULL
 * for a null attribute.
 */
Datum
fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
			bool *isnull)
{
	if (attnum <= 0)
		return (Datum) NULL;

	if (isnull)
		*isnull = false;

	if (HeapTupleNoNulls(tup))
	{
		/* no nulls in the tuple: use the cached offset when one is set */
		if (tupleDesc->attrs[attnum - 1]->attcacheoff >= 0)
			return fetchatt(tupleDesc->attrs[attnum - 1],
							(char *) tup->t_data + tup->t_data->t_hoff +
							tupleDesc->attrs[attnum - 1]->attcacheoff);

		return nocachegetattr(tup, attnum, tupleDesc, isnull);
	}

	/* the tuple has nulls: consult the null bitmap for this attribute */
	if (att_isnull(attnum - 1, tup->t_data->t_bits))
	{
		if (isnull)
			*isnull = true;
		return (Datum) NULL;
	}

	return nocachegetattr(tup, attnum, tupleDesc, isnull);
}
#endif   /* defined(DISABLE_COMPLEX_MACRO) */
808 809


810
/* ----------------------------------------------------------------
811
 *					 heap access method interface
812 813
 * ----------------------------------------------------------------
 */
814

815
/* ----------------
816
 *		relation_open - open any relation by relation OID
817
 *
818
 *		If lockmode is not "NoLock", the specified kind of lock is
819 820 821 822
 *		obtained on the relation.  (Generally, NoLock should only be
 *		used if the caller knows it has some appropriate lock on the
 *		relation already.)
 *
823
 *		An error is raised if the relation does not exist.
824 825 826
 *
 *		NB: a "relation" is anything with a pg_class entry.  The caller is
 *		expected to check whether the relkind is something it can handle.
827 828 829
 * ----------------
 */
Relation
830
relation_open(Oid relationId, LOCKMODE lockmode)
831
{
832
	Relation	r;
833

834 835
	Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

836 837 838 839
	/* Get the lock before trying to open the relcache entry */
	if (lockmode != NoLock)
		LockRelationOid(relationId, lockmode);

840 841
	/* The relcache does all the real work... */
	r = RelationIdGetRelation(relationId);
842

843
	if (!RelationIsValid(r))
844
		elog(ERROR, "could not open relation with OID %u", relationId);
845

846 847
	pgstat_initstats(r);

848
	return r;
849 850
}

851
/* ----------------
852
 *		try_relation_open - open any relation by relation OID
853
 *
854 855
 *		Same as relation_open, except return NULL instead of failing
 *		if the relation does not exist.
856 857
 * ----------------
 */
858
Relation
859 860 861 862 863 864 865 866 867 868 869
try_relation_open(Oid relationId, LOCKMODE lockmode)
{
	Relation	r;

	Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

	/* Get the lock first */
	if (lockmode != NoLock)
		LockRelationOid(relationId, lockmode);

	/*
Bruce Momjian's avatar
Bruce Momjian committed
870 871
	 * Now that we have the lock, probe to see if the relation really exists
	 * or not.
872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889
	 */
	if (!SearchSysCacheExists(RELOID,
							  ObjectIdGetDatum(relationId),
							  0, 0, 0))
	{
		/* Release useless lock */
		if (lockmode != NoLock)
			UnlockRelationOid(relationId, lockmode);

		return NULL;
	}

	/* Should be safe to do a relcache load */
	r = RelationIdGetRelation(relationId);

	if (!RelationIsValid(r))
		elog(ERROR, "could not open relation with OID %u", relationId);

890 891
	pgstat_initstats(r);

892 893 894 895 896 897 898 899 900 901 902 903
	return r;
}

/* ----------------
 *		relation_open_nowait - open but don't wait for lock
 *
 *		Behaves like relation_open, but raises an error instead of
 *		blocking when the requested lock cannot be had right away.
 * ----------------
 */
Relation
relation_open_nowait(Oid relationId, LOCKMODE lockmode)
{
	Relation	rel;

	Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

	/* try for the lock before touching the relcache entry */
	if (lockmode != NoLock &&
		!ConditionalLockRelationOid(relationId, lockmode))
	{
		/* report by name if we can; the relation may have been deleted */
		char	   *name = get_rel_name(relationId);

		if (name)
			ereport(ERROR,
					(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
					 errmsg("could not obtain lock on relation \"%s\"",
							name)));
		else
			ereport(ERROR,
					(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
					 errmsg("could not obtain lock on relation with OID %u",
							relationId)));
	}

	/* the relcache takes it from here */
	rel = RelationIdGetRelation(relationId);
	if (!RelationIsValid(rel))
		elog(ERROR, "could not open relation with OID %u", relationId);

	pgstat_initstats(rel);

	return rel;
}

941
/* ----------------
942
 *		relation_openrv - open any relation specified by a RangeVar
943
 *
944
 *		Same as relation_open, but the relation is specified by a RangeVar.
945 946 947
 * ----------------
 */
Relation
948
relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
949
{
950
	Oid			relOid;
951

952 953
	/*
	 * Check for shared-cache-inval messages before trying to open the
Bruce Momjian's avatar
Bruce Momjian committed
954 955
	 * relation.  This is needed to cover the case where the name identifies a
	 * rel that has been dropped and recreated since the start of our
956
	 * transaction: if we don't flush the old syscache entry then we'll latch
957 958 959
	 * onto that entry and suffer an error when we do RelationIdGetRelation.
	 * Note that relation_open does not need to do this, since a relation's
	 * OID never changes.
960
	 *
961 962
	 * We skip this if asked for NoLock, on the assumption that the caller has
	 * already ensured some appropriate lock is held.
963 964 965 966
	 */
	if (lockmode != NoLock)
		AcceptInvalidationMessages();

967 968 969 970 971 972 973
	/* Look up the appropriate relation using namespace search */
	relOid = RangeVarGetRelid(relation, false);

	/* Let relation_open do the rest */
	return relation_open(relOid, lockmode);
}

974
/* ----------------
975 976
 *		relation_close - close any relation
 *
977
 *		If lockmode is not "NoLock", we then release the specified lock.
978
 *
979 980
 *		Note that it is often sensible to hold a lock beyond relation_close;
 *		in that case, the lock is released automatically at xact end.
981 982
 * ----------------
 */
983 984
void
relation_close(Relation relation, LOCKMODE lockmode)
985
{
986
	LockRelId	relid = relation->rd_lockInfo.lockRelId;
987

988
	Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
989

990 991
	/* The relcache does the real work... */
	RelationClose(relation);
992 993 994

	if (lockmode != NoLock)
		UnlockRelationId(&relid, lockmode);
995
}
996

997

998
/* ----------------
999
 *		heap_open - open a heap relation by relation OID
1000
 *
1001
 *		This is essentially relation_open plus check that the relation
1002 1003
 *		is not an index nor a composite type.  (The caller should also
 *		check that it's not a view before assuming it has storage.)
1004 1005 1006
 * ----------------
 */
Relation
1007
heap_open(Oid relationId, LOCKMODE lockmode)
1008 1009
{
	Relation	r;
1010

1011
	r = relation_open(relationId, lockmode);
1012

1013
	if (r->rd_rel->relkind == RELKIND_INDEX)
1014 1015
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
1016
				 errmsg("\"%s\" is an index",
1017
						RelationGetRelationName(r))));
1018
	else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1019 1020 1021 1022
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is a composite type",
						RelationGetRelationName(r))));
1023

1024
	return r;
1025 1026 1027
}

/* ----------------
 *		heap_openrv - open a heap relation specified
 *		by a RangeVar node
 *
 *		relation_openrv plus a check that what was opened is neither an
 *		index nor a composite type; either one raises an error.
 * ----------------
 */
Relation
heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
{
	Relation	rel = relation_openrv(relation, lockmode);

	switch (rel->rd_rel->relkind)
	{
		case RELKIND_INDEX:
			ereport(ERROR,
					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
					 errmsg("\"%s\" is an index",
							RelationGetRelationName(rel))));
			break;

		case RELKIND_COMPOSITE_TYPE:
			ereport(ERROR,
					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
					 errmsg("\"%s\" is a composite type",
							RelationGetRelationName(rel))));
			break;

		default:
			/* any other relkind is accepted here */
			break;
	}

	return rel;
}


/* ----------------
1057
 *		heap_beginscan	- begin relation scan
1058
 *
1059
 * heap_beginscan_bm is an alternative entry point for setting up a HeapScanDesc
1060 1061 1062
 * for a bitmap heap scan.  Although that scan technology is really quite
 * unlike a standard seqscan, there is just enough commonality to make it
 * worth using the same data structure.
1063 1064 1065
 * ----------------
 */
HeapScanDesc
1066 1067
heap_beginscan(Relation relation, Snapshot snapshot,
			   int nkeys, ScanKey key)
1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081
{
	return heap_beginscan_internal(relation, snapshot, nkeys, key, false);
}

/*
 * heap_beginscan_bm - set up a scan descriptor for a bitmap heap scan
 *
 * Same arguments and result as heap_beginscan; the descriptor is merely
 * flagged as belonging to a bitmap scan.
 */
HeapScanDesc
heap_beginscan_bm(Relation relation, Snapshot snapshot,
				  int nkeys, ScanKey key)
{
	HeapScanDesc desc;

	/* is_bitmapscan = true */
	desc = heap_beginscan_internal(relation, snapshot, nkeys, key, true);

	return desc;
}

/*
 * heap_beginscan_internal - common worker for heap_beginscan and
 * heap_beginscan_bm
 *
 * Takes a reference on the relation, allocates a HeapScanDescData, stores
 * the caller-supplied relation, snapshot, key count and bitmap-scan flag in
 * it, allocates room for nkeys scan keys (if nkeys > 0), and lets initscan()
 * do the rest of the setup.  Returns the new descriptor.
 */
static HeapScanDesc
heap_beginscan_internal(Relation relation, Snapshot snapshot,
						int nkeys, ScanKey key, bool is_bitmapscan)
{
	HeapScanDesc scan;

	/*
	 * increment relation ref count while scanning relation
	 *
	 * This is just to make really sure the relcache entry won't go away while
	 * the scan has a pointer to it.  Caller should be holding the rel open
	 * anyway, so this is redundant in all normal scenarios...
	 */
	RelationIncrementReferenceCount(relation);

	/*
	 * allocate and initialize scan descriptor
	 */
	scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));

	scan->rs_rd = relation;
	scan->rs_snapshot = snapshot;
	scan->rs_nkeys = nkeys;
	scan->rs_bitmapscan = is_bitmapscan;
	scan->rs_strategy = NULL;	/* set in initscan */

	/*
	 * we can use page-at-a-time mode if it's an MVCC-safe snapshot
	 */
	scan->rs_pageatatime = IsMVCCSnapshot(snapshot);

	/* we only need to set this up once */
	scan->rs_ctup.t_tableOid = RelationGetRelid(relation);

	/*
	 * we do this here instead of in initscan() because heap_rescan also calls
	 * initscan() and we don't want to allocate memory again
	 */
	if (nkeys > 0)
		scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
	else
		scan->rs_key = NULL;

	initscan(scan, key);

	return scan;
}

/* ----------------
1128
 *		heap_rescan		- restart a relation scan
1129 1130 1131
 * ----------------
 */
void
1132
heap_rescan(HeapScanDesc scan,
1133
			ScanKey key)
1134
{
1135 1136
	/*
	 * unpin scan buffers
1137
	 */
1138 1139
	if (BufferIsValid(scan->rs_cbuf))
		ReleaseBuffer(scan->rs_cbuf);
1140

1141 1142
	/*
	 * reinitialize scan descriptor
1143
	 */
1144
	initscan(scan, key);
1145 1146 1147
}

/* ----------------
1148
 *		heap_endscan	- end relation scan
1149
 *
1150 1151
 *		See how to integrate with index scans.
 *		Check handling if reldesc caching.
1152 1153 1154
 * ----------------
 */
void
1155
heap_endscan(HeapScanDesc scan)
1156
{
1157 1158
	/* Note: no locking manipulations needed */

1159 1160
	/*
	 * unpin scan buffers
1161
	 */
1162 1163
	if (BufferIsValid(scan->rs_cbuf))
		ReleaseBuffer(scan->rs_cbuf);
1164

1165 1166
	/*
	 * decrement relation reference count and free scan descriptor storage
1167
	 */
1168
	RelationDecrementReferenceCount(scan->rs_rd);
1169

1170 1171 1172
	if (scan->rs_key)
		pfree(scan->rs_key);

1173 1174 1175
	if (scan->rs_strategy != NULL)
		FreeAccessStrategy(scan->rs_strategy);

1176
	pfree(scan);
1177 1178 1179
}

/* ----------------
 *		heap_getnext	- retrieve next tuple in scan
 *
 *		Fix to work with index relations.
 *		We don't return the buffer anymore, but you can get it from the
 *		returned HeapTuple.
 * ----------------
 */

/*
 * Tracing hooks used by heap_getnext.
 *
 * When HEAPDEBUGALL is defined, each macro emits one DEBUG2 message;
 * HEAPDEBUG_1 expects variables named "scan" and "direction" to be visible
 * where it is expanded.  Without HEAPDEBUGALL all three expand to nothing.
 */
#ifdef HEAPDEBUGALL
#define HEAPDEBUG_1 \
	elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
		 RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
#define HEAPDEBUG_2 \
	elog(DEBUG2, "heap_getnext returning EOS")
#define HEAPDEBUG_3 \
	elog(DEBUG2, "heap_getnext returning tuple")
#else
#define HEAPDEBUG_1
#define HEAPDEBUG_2
#define HEAPDEBUG_3
#endif   /* !defined(HEAPDEBUGALL) */
1201 1202


1203
HeapTuple
1204
heap_getnext(HeapScanDesc scan, ScanDirection direction)
1205
{
1206 1207 1208 1209
	/* Note: no locking manipulations needed */

	HEAPDEBUG_1;				/* heap_getnext( info ) */

1210
	if (scan->rs_pageatatime)
1211
		heapgettup_pagemode(scan, direction,
1212 1213
							scan->rs_nkeys, scan->rs_key);
	else
1214
		heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1215 1216

	if (scan->rs_ctup.t_data == NULL)
1217
	{
1218 1219
		HEAPDEBUG_2;			/* heap_getnext returning EOS */
		return NULL;
1220 1221
	}

1222
	/*
1223 1224
	 * if we get here it means we have a new current scan tuple, so point to
	 * the proper return buffer and return the tuple.
1225
	 */
1226
	HEAPDEBUG_3;				/* heap_getnext returning tuple */
1227

1228
	pgstat_count_heap_getnext(scan->rs_rd);
1229

1230
	return &(scan->rs_ctup);
1231 1232
}

1233 1234
/*
 *	heap_fetch		- retrieve tuple with given tid
1235
 *
1236 1237 1238
 * On entry, tuple->t_self is the TID to fetch.  We pin the buffer holding
 * the tuple, fill in the remaining fields of *tuple, and check the tuple
 * against the specified snapshot.
1239
 *
1240 1241 1242
 * If successful (tuple found and passes snapshot time qual), then *userbuf
 * is set to the buffer holding the tuple and TRUE is returned.  The caller
 * must unpin the buffer when done with the tuple.
1243
 *
1244 1245
 * If the tuple is not found (ie, item number references a deleted slot),
 * then tuple->t_data is set to NULL and FALSE is returned.
1246
 *
1247 1248 1249 1250 1251 1252 1253
 * If the tuple is found but fails the time qual check, then FALSE is returned
 * but tuple->t_data is left pointing to the tuple.
 *
 * keep_buf determines what is done with the buffer in the FALSE-result cases.
 * When the caller specifies keep_buf = true, we retain the pin on the buffer
 * and return it in *userbuf (so the caller must eventually unpin it); when
 * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1254
 *
1255 1256 1257 1258
 * stats_relation is the relation to charge the heap_fetch operation against
 * for statistical purposes.  (This could be the heap rel itself, an
 * associated index, or NULL to not count the fetch at all.)
 *
1259 1260 1261
 * heap_fetch does not follow HOT chains: only the exact TID requested will
 * be fetched.
 *
1262
 * It is somewhat inconsistent that we ereport() on invalid block number but
1263 1264 1265 1266 1267 1268 1269
 * return false on invalid item number.  There are a couple of reasons though.
 * One is that the caller can relatively easily check the block number for
 * validity, but cannot check the item number without reading the page
 * himself.  Another is that when we are following a t_ctid link, we can be
 * reasonably confident that the page number is valid (since VACUUM shouldn't
 * truncate off the destination page without having killed the referencing
 * tuple first), but the item number might well not be good.
1270
 */
1271
bool
1272
heap_fetch(Relation relation,
1273
		   Snapshot snapshot,
1274
		   HeapTuple tuple,
1275
		   Buffer *userbuf,
1276
		   bool keep_buf,
1277
		   Relation stats_relation)
1278 1279 1280 1281
{
	/* Assume *userbuf is undefined on entry */
	*userbuf = InvalidBuffer;
	return heap_release_fetch(relation, snapshot, tuple,
1282
							  userbuf, keep_buf, stats_relation);
1283 1284 1285 1286 1287 1288 1289 1290 1291
}

/*
 *	heap_release_fetch		- retrieve tuple with given tid
 *
 * This has the same API as heap_fetch except that if *userbuf is not
 * InvalidBuffer on entry, that buffer will be released before reading
 * the new page.  This saves a separate ReleaseBuffer step and hence
 * one entry into the bufmgr when looping through multiple fetches.
 * Also, if *userbuf is the same buffer that holds the target tuple,
 * we avoid bufmgr manipulation altogether.
 *
 * Every return path drops the buffer content lock before returning.  On a
 * FALSE result the pin is either handed back in *userbuf (keep_buf) or
 * released, with *userbuf set to InvalidBuffer.
 */
bool
heap_release_fetch(Relation relation,
				   Snapshot snapshot,
				   HeapTuple tuple,
				   Buffer *userbuf,
				   bool keep_buf,
				   Relation stats_relation)
{
	ItemPointer tid = &(tuple->t_self);
	ItemId		lp;
	Buffer		buffer;
	PageHeader	dp;
	OffsetNumber offnum;
	bool		valid;

	/*
	 * get the buffer from the relation descriptor. Note that this does a
	 * buffer pin, and releases the old *userbuf if not InvalidBuffer.
	 */
	buffer = ReleaseAndReadBuffer(*userbuf, relation,
								  ItemPointerGetBlockNumber(tid));

	/*
	 * Need share lock on buffer to examine tuple commit status.
	 */
	LockBuffer(buffer, BUFFER_LOCK_SHARE);
	dp = (PageHeader) BufferGetPage(buffer);

	/*
	 * We'd better check for out-of-range offnum in case of VACUUM since the
	 * TID was obtained.
	 */
	offnum = ItemPointerGetOffsetNumber(tid);
	if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
	{
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (keep_buf)
			*userbuf = buffer;
		else
		{
			ReleaseBuffer(buffer);
			*userbuf = InvalidBuffer;
		}
		tuple->t_data = NULL;
		return false;
	}

	/*
	 * get the item line pointer corresponding to the requested tid
	 */
	lp = PageGetItemId(dp, offnum);

	/*
	 * Must check for deleted tuple.
	 */
	if (!ItemIdIsNormal(lp))
	{
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (keep_buf)
			*userbuf = buffer;
		else
		{
			ReleaseBuffer(buffer);
			*userbuf = InvalidBuffer;
		}
		tuple->t_data = NULL;
		return false;
	}

	/*
	 * fill in *tuple fields
	 */
	tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
	tuple->t_len = ItemIdGetLength(lp);
	tuple->t_tableOid = RelationGetRelid(relation);

	/*
	 * check time qualification of tuple, then release lock
	 */
	valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	if (valid)
	{
		/*
		 * All checks passed, so return the tuple as valid. Caller is now
		 * responsible for releasing the buffer.
		 */
		*userbuf = buffer;

		/* Count the successful fetch against appropriate rel, if any */
		if (stats_relation != NULL)
			pgstat_count_heap_fetch(stats_relation);

		return true;
	}

	/* Tuple failed time qual, but maybe caller wants to see it anyway. */
	if (keep_buf)
		*userbuf = buffer;
	else
	{
		ReleaseBuffer(buffer);
		*userbuf = InvalidBuffer;
	}

	return false;
}

1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541
/*
 *	heap_hot_search_buffer	- walk a HOT chain looking for a visible member
 *
 * *tid names either a plain tuple or the root of a HOT chain, and buffer
 * is the buffer that holds it.  Chain members are examined in order; the
 * first one that satisfies the snapshot wins, in which case the offset
 * number in *tid is updated to point at it and TRUE is returned.  When
 * nothing qualifies, FALSE is returned and *tid is left as it was.
 *
 * all_dead may be NULL.  If it is not, *all_dead is TRUE on exit only when
 * no member was visible and HeapTupleSatisfiesVacuum reported every member
 * we looked at as HEAPTUPLE_DEAD; otherwise it is FALSE.
 *
 * The caller must already hold a pin and at least a share lock on the
 * buffer; neither is changed here.  No pgstats counter is touched either,
 * so the caller should count the access if that is wanted.
 */
bool
heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
					   bool *all_dead)
{
	Page		page = (Page) BufferGetPage(buffer);
	TransactionId expected_xmin = InvalidTransactionId;
	OffsetNumber cur_off;
	bool		first_member = true;

	/* start out optimistic; cleared as soon as a member proves otherwise */
	if (all_dead)
		*all_dead = true;

	Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
	cur_off = ItemPointerGetOffsetNumber(tid);

	/* Keep going only while the offset refers to a line pointer on the page */
	while (cur_off >= FirstOffsetNumber &&
		   cur_off <= PageGetMaxOffsetNumber(page))
	{
		ItemId		itemid = PageGetItemId(page, cur_off);
		HeapTupleData member;

		if (!ItemIdIsNormal(itemid))
		{
			/*
			 * Unused, dead or redirected item.  A redirect is followed only
			 * when it is the very first thing we look at; anything else
			 * ends the chain.
			 */
			if (!(ItemIdIsRedirected(itemid) && first_member))
				break;

			cur_off = ItemIdGetRedirect(itemid);
			first_member = false;
			continue;
		}

		member.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
		member.t_len = ItemIdGetLength(itemid);

		/* A chain must not begin with a heap-only tuple */
		if (first_member && HeapTupleIsHeapOnly(&member))
			break;

		/* xmin has to equal the xmax we came from, or the chain is broken */
		if (TransactionIdIsValid(expected_xmin) &&
			!TransactionIdEquals(expected_xmin,
								 HeapTupleHeaderGetXmin(member.t_data)))
			break;

		/* Visible to the snapshot: report this member and stop */
		if (HeapTupleSatisfiesVisibility(&member, snapshot, buffer))
		{
			ItemPointerSetOffsetNumber(tid, cur_off);
			if (all_dead)
				*all_dead = false;
			return true;
		}

		/*
		 * Not visible to us.  If the caller asked, and nothing has cleared
		 * the flag yet, check whether this member is dead to everybody.
		 */
		if (all_dead && *all_dead &&
			HeapTupleSatisfiesVacuum(member.t_data, RecentGlobalXmin,
									 buffer) != HEAPTUPLE_DEAD)
			*all_dead = false;

		/* No HOT successor means we have reached the end of the chain */
		if (!HeapTupleIsHotUpdated(&member))
			break;

		/* Step to the successor, remembering the xmax it has to match */
		Assert(ItemPointerGetBlockNumber(&member.t_data->t_ctid) ==
			   ItemPointerGetBlockNumber(tid));
		cur_off = ItemPointerGetOffsetNumber(&member.t_data->t_ctid);
		first_member = false;
		expected_xmin = HeapTupleHeaderGetXmax(member.t_data);
	}

	return false;
}

/*
 *	heap_hot_search		- HOT chain search that fetches the page itself
 *
 * Arguments and result are those of heap_hot_search_buffer, minus the
 * buffer: the block named by *tid is read and share-locked here, searched,
 * and then unlocked and unpinned again before returning.
 */
bool
heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
				bool *all_dead)
{
	Buffer		buf = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
	bool		found;

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	found = heap_hot_search_buffer(tid, buf, snapshot, all_dead);

	/* drop the content lock and the pin in one call */
	UnlockReleaseBuffer(buf);

	return found;
}

1542
/*
1543
 *	heap_get_latest_tid -  get the latest tid of a specified tuple
1544 1545 1546 1547 1548 1549
 *
 * Actually, this gets the latest version that is visible according to
 * the passed snapshot.  You can pass SnapshotDirty to get the very latest,
 * possibly uncommitted version.
 *
 * *tid is both an input and an output parameter: it is updated to
1550
 * show the latest version of the row.	Note that it will not be changed
1551
 * if no version of the row passes the snapshot test.
1552
 */
1553
void
1554
heap_get_latest_tid(Relation relation,
1555 1556
					Snapshot snapshot,
					ItemPointer tid)
1557
{
1558
	BlockNumber blk;
1559
	ItemPointerData ctid;
1560
	TransactionId priorXmax;
1561

1562 1563 1564
	/* this is to avoid Assert failures on bad input */
	if (!ItemPointerIsValid(tid))
		return;
1565

1566
	/*
1567 1568 1569 1570
	 * Since this can be called with user-supplied TID, don't trust the input
	 * too much.  (RelationGetNumberOfBlocks is an expensive check, so we
	 * don't check t_ctid links again this way.  Note that it would not do to
	 * call it just once and save the result, either.)
1571
	 */
1572 1573 1574 1575
	blk = ItemPointerGetBlockNumber(tid);
	if (blk >= RelationGetNumberOfBlocks(relation))
		elog(ERROR, "block number %u is out of range for relation \"%s\"",
			 blk, RelationGetRelationName(relation));
1576

1577
	/*
1578 1579 1580
	 * Loop to chase down t_ctid links.  At top of loop, ctid is the tuple we
	 * need to examine, and *tid is the TID we will return if ctid turns out
	 * to be bogus.
1581 1582 1583 1584
	 *
	 * Note that we will loop until we reach the end of the t_ctid chain.
	 * Depending on the snapshot passed, there might be at most one visible
	 * version of the row, but we don't try to optimize for that.
1585
	 */
1586 1587 1588 1589 1590 1591 1592 1593 1594 1595
	ctid = *tid;
	priorXmax = InvalidTransactionId;	/* cannot check first XMIN */
	for (;;)
	{
		Buffer		buffer;
		PageHeader	dp;
		OffsetNumber offnum;
		ItemId		lp;
		HeapTupleData tp;
		bool		valid;
1596

1597 1598 1599 1600 1601 1602
		/*
		 * Read, pin, and lock the page.
		 */
		buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
		LockBuffer(buffer, BUFFER_LOCK_SHARE);
		dp = (PageHeader) BufferGetPage(buffer);
1603

1604 1605
		/*
		 * Check for bogus item number.  This is not treated as an error
1606 1607
		 * condition because it can happen while following a t_ctid link. We
		 * just assume that the prior tid is OK and return it unchanged.
1608 1609 1610 1611
		 */
		offnum = ItemPointerGetOffsetNumber(&ctid);
		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
		{
1612
			UnlockReleaseBuffer(buffer);
1613 1614 1615
			break;
		}
		lp = PageGetItemId(dp, offnum);
1616
		if (!ItemIdIsNormal(lp))
1617
		{
1618
			UnlockReleaseBuffer(buffer);
1619 1620
			break;
		}
1621

1622 1623 1624 1625
		/* OK to access the tuple */
		tp.t_self = ctid;
		tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
		tp.t_len = ItemIdGetLength(lp);
1626

1627 1628 1629 1630 1631
		/*
		 * After following a t_ctid link, we might arrive at an unrelated
		 * tuple.  Check for XMIN match.
		 */
		if (TransactionIdIsValid(priorXmax) &&
1632
		  !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1633
		{
1634
			UnlockReleaseBuffer(buffer);
1635 1636
			break;
		}
1637

1638 1639 1640 1641
		/*
		 * Check time qualification of tuple; if visible, set it as the new
		 * result candidate.
		 */
1642
		valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1643 1644
		if (valid)
			*tid = ctid;
1645

1646 1647 1648 1649 1650 1651
		/*
		 * If there's a valid t_ctid link, follow it, else we're done.
		 */
		if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) ||
			ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
		{
1652
			UnlockReleaseBuffer(buffer);
1653 1654
			break;
		}
1655

1656 1657
		ctid = tp.t_data->t_ctid;
		priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
1658
		UnlockReleaseBuffer(buffer);
1659
	}							/* end of loop */
1660 1661
}

1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689

/*
 * UpdateXmaxHintBits - refresh xmax hint bits once the xmax xact is over
 *
 * Meant to be called after waiting for the tuple's XMAX transaction to
 * finish; xid must be that XMAX.  If neither xmax hint bit is set yet, we
 * ask for XMAX_COMMITTED when the transaction is known committed and for
 * XMAX_INVALID otherwise.  Setting XMAX_COMMITTED may not be possible yet
 * when the commit was asynchronous, so callers should only rely on
 * XMAX_INVALID afterwards.
 */
static void
UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
{
	Assert(TransactionIdEquals(HeapTupleHeaderGetXmax(tuple), xid));

	/* One of the hint bits is already present: nothing left to do */
	if (tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID))
		return;

	if (!TransactionIdDidCommit(xid))
		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
							 InvalidTransactionId);
	else
		HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
							 xid);
}


1690 1691
/*
 *	heap_insert		- insert tuple into a heap
1692
 *
1693 1694
 * The new tuple is stamped with current transaction ID and the specified
 * command ID.
1695 1696 1697 1698
 *
 * If use_wal is false, the new tuple is not logged in WAL, even for a
 * non-temp relation.  Safe usage of this behavior requires that we arrange
 * that all new tuples go into new pages not containing any tuples from other
1699 1700
 * transactions, and that the relation gets fsync'd before commit.
 * (See also heap_sync() comments)
1701 1702 1703
 *
 * use_fsm is passed directly to RelationGetBufferForTuple, which see for
 * more info.
1704
 *
1705 1706 1707
 * Note that use_wal and use_fsm will be applied when inserting into the
 * heap's TOAST table, too, if the tuple requires any out-of-line data.
 *
1708 1709 1710
 * The return value is the OID assigned to the tuple (either here or by the
 * caller), or InvalidOid if no OID.  The header fields of *tup are updated
 * to match the stored tuple; in particular tup->t_self receives the actual
1711
 * TID where the tuple was stored.	But note that any toasting of fields
1712
 * within the tuple data is NOT reflected into *tup.
1713 1714
 */
Oid
1715 1716
heap_insert(Relation relation, HeapTuple tup, CommandId cid,
			bool use_wal, bool use_fsm)
1717
{
1718
	TransactionId xid = GetCurrentTransactionId();
1719
	HeapTuple	heaptup;
1720
	Buffer		buffer;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1721

1722 1723
	if (relation->rd_rel->relhasoids)
	{
1724 1725 1726 1727
#ifdef NOT_USED
		/* this is redundant with an Assert in HeapTupleSetOid */
		Assert(tup->t_data->t_infomask & HEAP_HASOID);
#endif
Bruce Momjian's avatar
Bruce Momjian committed
1728

1729
		/*
1730 1731 1732 1733 1734 1735
		 * If the object id of this tuple has already been assigned, trust the
		 * caller.	There are a couple of ways this can happen.  At initial db
		 * creation, the backend program sets oids for tuples. When we define
		 * an index, we set the oid.  Finally, in the future, we may allow
		 * users to set their own object ids in order to support a persistent
		 * object store (objects need to contain pointers to one another).
1736
		 */
1737
		if (!OidIsValid(HeapTupleGetOid(tup)))
1738
			HeapTupleSetOid(tup, GetNewOid(relation));
1739
	}
1740 1741 1742 1743 1744
	else
	{
		/* check there is not space for an OID */
		Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
	}
1745

1746
	tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
1747
	tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
1748
	tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
1749
	HeapTupleHeaderSetXmin(tup->t_data, xid);
1750
	HeapTupleHeaderSetCmin(tup->t_data, cid);
1751
	HeapTupleHeaderSetXmax(tup->t_data, 0);		/* for cleanliness */
1752
	tup->t_tableOid = RelationGetRelid(relation);
1753

1754 1755
	/*
	 * If the new tuple is too big for storage or contains already toasted
1756
	 * out-of-line attributes from some other relation, invoke the toaster.
1757
	 *
1758 1759
	 * Note: below this point, heaptup is the data we actually intend to store
	 * into the relation; tup is the caller's original untoasted data.
Jan Wieck's avatar
TOAST  
Jan Wieck committed
1760
	 */
1761
	if (relation->rd_rel->relkind != RELKIND_RELATION)
1762 1763 1764 1765 1766 1767
	{
		/* toast table entries should never be recursively toasted */
		Assert(!HeapTupleHasExternal(tup));
		heaptup = tup;
	}
	else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
1768 1769
		heaptup = toast_insert_or_update(relation, tup, NULL,
										 use_wal, use_fsm);
1770 1771
	else
		heaptup = tup;
Jan Wieck's avatar
TOAST  
Jan Wieck committed
1772

1773
	/* Find buffer to insert this tuple into */
1774
	buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
1775
									   InvalidBuffer, use_fsm);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1776

1777
	/* NO EREPORT(ERROR) from here till changes are logged */
1778
	START_CRIT_SECTION();
1779

1780
	RelationPutHeapTuple(relation, buffer, heaptup);
1781

1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792
	/*
	 * XXX Should we set PageSetPrunable on this page ?
	 *
	 * The inserting transaction may eventually abort thus making this tuple
	 * DEAD and hence available for pruning. Though we don't want to optimize
	 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
	 * aborted tuple will never be pruned until next vacuum is triggered.
	 *
	 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
	 */

1793 1794
	MarkBufferDirty(buffer);

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1795
	/* XLOG stuff */
1796
	if (use_wal && !relation->rd_istemp)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1797
	{
1798 1799 1800 1801 1802 1803
		xl_heap_insert xlrec;
		xl_heap_header xlhdr;
		XLogRecPtr	recptr;
		XLogRecData rdata[3];
		Page		page = BufferGetPage(buffer);
		uint8		info = XLOG_HEAP_INSERT;
1804

1805
		xlrec.target.node = relation->rd_node;
1806
		xlrec.target.tid = heaptup->t_self;
1807
		rdata[0].data = (char *) &xlrec;
1808
		rdata[0].len = SizeOfHeapInsert;
1809
		rdata[0].buffer = InvalidBuffer;
1810 1811
		rdata[0].next = &(rdata[1]);

1812
		xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
1813 1814
		xlhdr.t_infomask = heaptup->t_data->t_infomask;
		xlhdr.t_hoff = heaptup->t_data->t_hoff;
Bruce Momjian's avatar
Bruce Momjian committed
1815

1816
		/*
1817 1818 1819
		 * note we mark rdata[1] as belonging to buffer; if XLogInsert decides
		 * to write the whole page to the xlog, we don't need to store
		 * xl_heap_header in the xlog.
1820
		 */
1821
		rdata[1].data = (char *) &xlhdr;
1822
		rdata[1].len = SizeOfHeapHeader;
1823 1824
		rdata[1].buffer = buffer;
		rdata[1].buffer_std = true;
1825 1826
		rdata[1].next = &(rdata[2]);

1827
		/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
1828 1829
		rdata[2].data = (char *) heaptup->t_data + offsetof(HeapTupleHeaderData, t_bits);
		rdata[2].len = heaptup->t_len - offsetof(HeapTupleHeaderData, t_bits);
1830 1831
		rdata[2].buffer = buffer;
		rdata[2].buffer_std = true;
1832 1833
		rdata[2].next = NULL;

1834
		/*
1835 1836 1837
		 * If this is the single and first tuple on page, we can reinit the
		 * page instead of restoring the whole thing.  Set flag, and hide
		 * buffer references from XLogInsert.
1838
		 */
1839
		if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
1840 1841 1842 1843 1844
			PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
		{
			info |= XLOG_HEAP_INIT_PAGE;
			rdata[1].buffer = rdata[2].buffer = InvalidBuffer;
		}
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1845

1846 1847 1848
		recptr = XLogInsert(RM_HEAP_ID, info, rdata);

		PageSetLSN(page, recptr);
1849
		PageSetTLI(page, ThisTimeLineID);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1850
	}
1851

1852
	END_CRIT_SECTION();
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1853

1854
	UnlockReleaseBuffer(buffer);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1855

1856
	/*
1857
	 * If tuple is cachable, mark it for invalidation from the caches in case
1858 1859 1860
	 * we abort.  Note it is OK to do this after releasing the buffer, because
	 * the heaptup data structure is all in local memory, not in the shared
	 * buffer.
1861
	 */
1862
	CacheInvalidateHeapTuple(relation, heaptup);
1863

1864
	pgstat_count_heap_insert(relation);
1865

1866 1867 1868 1869 1870 1871 1872 1873 1874 1875
	/*
	 * If heaptup is a private copy, release it.  Don't forget to copy t_self
	 * back to the caller's image, too.
	 */
	if (heaptup != tup)
	{
		tup->t_self = heaptup->t_self;
		heap_freetuple(heaptup);
	}

1876
	return HeapTupleGetOid(tup);
1877 1878
}

1879 1880 1881 1882
/*
 *	simple_heap_insert - insert a tuple
 *
 * Currently, this routine differs from heap_insert only in supplying
 * a default command ID and not allowing access to the speedup options.
 * (It passes GetCurrentCommandId() as the command ID and true for both
 * use_wal and use_fsm.)
 *
 * This should be used rather than using heap_insert directly in most places
 * where we are modifying system catalogs.
 *
 * Returns whatever heap_insert returns.
 */
Oid
simple_heap_insert(Relation relation, HeapTuple tup)
{
	return heap_insert(relation, tup, GetCurrentCommandId(), true, true);
}

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1894
/*
1895
 *	heap_delete - delete a tuple
1896 1897 1898
 *
 * NB: do not call this directly unless you are prepared to deal with
 * concurrent-update conditions.  Use simple_heap_delete instead.
1899
 *
1900
 *	relation - table to be modified (caller must hold suitable lock)
1901 1902
 *	tid - TID of tuple to be deleted
 *	ctid - output parameter, used only for failure case (see below)
1903 1904 1905
 *	update_xmax - output parameter, used only for failure case (see below)
 *	cid - delete command ID (used for visibility test, and stored into
 *		cmax if successful)
1906
 *	crosscheck - if not InvalidSnapshot, also check tuple against this
1907 1908
 *	wait - true if should wait for any conflicting update to commit/abort
 *
1909 1910 1911
 * Normal, successful return value is HeapTupleMayBeUpdated, which
 * actually means we did delete it.  Failure return codes are
 * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
1912 1913 1914 1915 1916 1917
 * (the last only possible if wait == false).
 *
 * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
 * If t_ctid is the same as tid, the tuple was deleted; if different, the
 * tuple was updated, and t_ctid is the location of the replacement tuple.
 * (t_xmax is needed to verify that the replacement tuple matches.)
1918
 */
1919
HTSU_Result
1920
heap_delete(Relation relation, ItemPointer tid,
1921 1922
			ItemPointer ctid, TransactionId *update_xmax,
			CommandId cid, Snapshot crosscheck, bool wait)
1923
{
1924
	HTSU_Result result;
1925
	TransactionId xid = GetCurrentTransactionId();
Bruce Momjian's avatar
Bruce Momjian committed
1926 1927 1928 1929
	ItemId		lp;
	HeapTupleData tp;
	PageHeader	dp;
	Buffer		buffer;
1930
	bool		have_tuple_lock = false;
1931
	bool		iscombo;
1932 1933 1934

	Assert(ItemPointerIsValid(tid));

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1935 1936
	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1937

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1938 1939
	dp = (PageHeader) BufferGetPage(buffer);
	lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
1940
	Assert(ItemIdIsNormal(lp));
1941 1942

	tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1943 1944
	tp.t_len = ItemIdGetLength(lp);
	tp.t_self = *tid;
Bruce Momjian's avatar
Bruce Momjian committed
1945

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1946
l1:
1947
	result = HeapTupleSatisfiesUpdate(tp.t_data, cid, buffer);
Bruce Momjian's avatar
Bruce Momjian committed
1948

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1949
	if (result == HeapTupleInvisible)
1950
	{
1951
		UnlockReleaseBuffer(buffer);
1952
		elog(ERROR, "attempted to delete invisible tuple");
1953
	}
1954
	else if (result == HeapTupleBeingUpdated && wait)
1955
	{
1956
		TransactionId xwait;
1957
		uint16		infomask;
1958

1959 1960 1961 1962 1963 1964 1965
		/* must copy state data before unlocking buffer */
		xwait = HeapTupleHeaderGetXmax(tp.t_data);
		infomask = tp.t_data->t_infomask;

		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

		/*
1966 1967
		 * Acquire tuple lock to establish our priority for the tuple (see
		 * heap_lock_tuple).  LockTuple will release us when we are
1968 1969
		 * next-in-line for the tuple.
		 *
1970 1971
		 * If we are forced to "start over" below, we keep the tuple lock;
		 * this arranges that we stay at the head of the line while rechecking
1972
		 * tuple state.
1973 1974 1975 1976 1977 1978 1979
		 */
		if (!have_tuple_lock)
		{
			LockTuple(relation, &(tp.t_self), ExclusiveLock);
			have_tuple_lock = true;
		}

1980
		/*
1981 1982 1983
		 * Sleep until concurrent transaction ends.  Note that we don't care
		 * if the locker has an exclusive or shared lock, because we need
		 * exclusive.
1984
		 */
1985 1986

		if (infomask & HEAP_XMAX_IS_MULTI)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
1987
		{
1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002
			/* wait for multixact */
			MultiXactIdWait((MultiXactId) xwait);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

			/*
			 * If xwait had just locked the tuple then some other xact could
			 * update this tuple before we get to this point.  Check for xmax
			 * change, and start over if so.
			 */
			if (!(tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
				!TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
									 xwait))
				goto l1;

			/*
2003 2004 2005 2006 2007 2008
			 * You might think the multixact is necessarily done here, but not
			 * so: it could have surviving members, namely our own xact or
			 * other subxacts of this backend.	It is legal for us to delete
			 * the tuple in either case, however (the latter case is
			 * essentially a situation of upgrading our former shared lock to
			 * exclusive).	We don't bother changing the on-disk hint bits
2009 2010
			 * since we are about to overwrite the xmax altogether.
			 */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2011
		}
2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027
		else
		{
			/* wait for regular transaction to end */
			XactLockTableWait(xwait);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

			/*
			 * xwait is done, but if xwait had just locked the tuple then some
			 * other xact could update this tuple before we get to this point.
			 * Check for xmax change, and start over if so.
			 */
			if ((tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
				!TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
									 xwait))
				goto l1;

2028 2029
			/* Otherwise check if it committed or aborted */
			UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2030 2031 2032
		}

		/*
2033 2034
		 * We may overwrite if previous xmax aborted, or if it committed but
		 * only locked the tuple without updating it.
2035 2036 2037
		 */
		if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
									 HEAP_IS_LOCKED))
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2038 2039 2040 2041
			result = HeapTupleMayBeUpdated;
		else
			result = HeapTupleUpdated;
	}
2042

2043
	if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
2044 2045
	{
		/* Perform additional check for serializable RI updates */
2046
		if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2047 2048 2049
			result = HeapTupleUpdated;
	}

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2050 2051
	if (result != HeapTupleMayBeUpdated)
	{
2052 2053 2054
		Assert(result == HeapTupleSelfUpdated ||
			   result == HeapTupleUpdated ||
			   result == HeapTupleBeingUpdated);
2055
		Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2056
		*ctid = tp.t_data->t_ctid;
2057
		*update_xmax = HeapTupleHeaderGetXmax(tp.t_data);
2058
		UnlockReleaseBuffer(buffer);
2059 2060
		if (have_tuple_lock)
			UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2061
		return result;
2062 2063
	}

2064 2065 2066
	/* replace cid with a combo cid if necessary */
	HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);

2067
	START_CRIT_SECTION();
2068

2069 2070
	/*
	 * If this transaction commits, the tuple will become DEAD sooner or
2071 2072 2073 2074
	 * later.  Set flag that this page is a candidate for pruning once our xid
	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
	 * the subsequent page pruning will be a no-op and the hint will be
	 * cleared.
2075
	 */
2076
	PageSetPrunable(dp, xid);
2077

2078 2079
	/* store transaction information of xact deleting the tuple */
	tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
2080
							   HEAP_XMAX_INVALID |
2081 2082
							   HEAP_XMAX_IS_MULTI |
							   HEAP_IS_LOCKED |
2083
							   HEAP_MOVED);
2084
	HeapTupleHeaderClearHotUpdated(tp.t_data);
2085
	HeapTupleHeaderSetXmax(tp.t_data, xid);
2086
	HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2087 2088
	/* Make sure there is no forward chain link in t_ctid */
	tp.t_data->t_ctid = tp.t_self;
2089

2090 2091
	MarkBufferDirty(buffer);

2092
	/* XLOG stuff */
2093
	if (!relation->rd_istemp)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2094
	{
2095 2096 2097
		xl_heap_delete xlrec;
		XLogRecPtr	recptr;
		XLogRecData rdata[2];
2098

2099 2100
		xlrec.target.node = relation->rd_node;
		xlrec.target.tid = tp.t_self;
2101
		rdata[0].data = (char *) &xlrec;
2102
		rdata[0].len = SizeOfHeapDelete;
2103
		rdata[0].buffer = InvalidBuffer;
2104 2105 2106 2107
		rdata[0].next = &(rdata[1]);

		rdata[1].data = NULL;
		rdata[1].len = 0;
2108 2109
		rdata[1].buffer = buffer;
		rdata[1].buffer_std = true;
2110 2111 2112
		rdata[1].next = NULL;

		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE, rdata);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2113

2114
		PageSetLSN(dp, recptr);
2115
		PageSetTLI(dp, ThisTimeLineID);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2116
	}
2117

2118
	END_CRIT_SECTION();
2119

2120 2121
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

2122
	/*
2123
	 * If the tuple has toasted out-of-line attributes, we need to delete
2124 2125 2126
	 * those items too.  We have to do this before releasing the buffer
	 * because we need to look at the contents of the tuple, but it's OK to
	 * release the content lock on the buffer first.
Jan Wieck's avatar
TOAST  
Jan Wieck committed
2127
	 */
2128
	if (relation->rd_rel->relkind != RELKIND_RELATION)
2129 2130 2131 2132 2133
	{
		/* toast table entries should never be recursively toasted */
		Assert(!HeapTupleHasExternal(&tp));
	}
	else if (HeapTupleHasExternal(&tp))
2134
		toast_delete(relation, &tp);
Jan Wieck's avatar
TOAST  
Jan Wieck committed
2135

2136
	/*
2137
	 * Mark tuple for invalidation from system caches at next command
2138 2139
	 * boundary. We have to do this before releasing the buffer because we
	 * need to look at the contents of the tuple.
2140
	 */
2141
	CacheInvalidateHeapTuple(relation, &tp);
2142

2143 2144
	/* Now we can release the buffer */
	ReleaseBuffer(buffer);
2145

2146 2147 2148 2149 2150 2151
	/*
	 * Release the lmgr tuple lock, if we had it.
	 */
	if (have_tuple_lock)
		UnlockTuple(relation, &(tp.t_self), ExclusiveLock);

2152
	pgstat_count_heap_delete(relation);
2153

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2154
	return HeapTupleMayBeUpdated;
2155 2156
}

2157 2158 2159 2160 2161
/*
 *	simple_heap_delete - delete a tuple, treating any conflict as an error
 *
 * Convenience wrapper around heap_delete() for callers that do not expect
 * concurrent updates of the target tuple (for example, because they hold a
 * lock on the relation associated with the tuple).  The delete is issued
 * with the current command ID, no crosscheck snapshot, and wait = true.
 *
 * Any outcome other than success is reported via elog(ERROR); the ctid and
 * xmax that heap_delete() hands back for the failure cases are discarded.
 */
void
simple_heap_delete(Relation relation, ItemPointer tid)
{
	ItemPointerData failed_ctid;
	TransactionId failed_xmax;
	HTSU_Result outcome;

	outcome = heap_delete(relation, tid,
						  &failed_ctid, &failed_xmax,
						  GetCurrentCommandId(), InvalidSnapshot,
						  true /* wait for commit */ );

	/* done successfully */
	if (outcome == HeapTupleMayBeUpdated)
		return;

	if (outcome == HeapTupleSelfUpdated)
	{
		/* Tuple was already updated in current command? */
		elog(ERROR, "tuple already updated by self");
	}
	else if (outcome == HeapTupleUpdated)
		elog(ERROR, "tuple concurrently updated");
	else
		elog(ERROR, "unrecognized heap_delete status: %u", outcome);
}

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2197
/*
2198
 *	heap_update - replace a tuple
2199 2200 2201
 *
 * NB: do not call this directly unless you are prepared to deal with
 * concurrent-update conditions.  Use simple_heap_update instead.
2202
 *
2203
 *	relation - table to be modified (caller must hold suitable lock)
2204 2205 2206
 *	otid - TID of old tuple to be replaced
 *	newtup - newly constructed tuple data to store
 *	ctid - output parameter, used only for failure case (see below)
2207 2208 2209
 *	update_xmax - output parameter, used only for failure case (see below)
 *	cid - update command ID (used for visibility test, and stored into
 *		cmax/cmin if successful)
2210
 *	crosscheck - if not InvalidSnapshot, also check old tuple against this
2211 2212
 *	wait - true if should wait for any conflicting update to commit/abort
 *
2213 2214 2215
 * Normal, successful return value is HeapTupleMayBeUpdated, which
 * actually means we *did* update it.  Failure return codes are
 * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
2216 2217
 * (the last only possible if wait == false).
 *
2218 2219
 * On success, the header fields of *newtup are updated to match the new
 * stored tuple; in particular, newtup->t_self is set to the TID where the
2220 2221
 * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
 * update was done.  However, any TOAST changes in the new tuple's
2222
 * data are not reflected into *newtup.
2223 2224 2225 2226 2227
 *
 * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
 * If t_ctid is the same as otid, the tuple was deleted; if different, the
 * tuple was updated, and t_ctid is the location of the replacement tuple.
 * (t_xmax is needed to verify that the replacement tuple matches.)
2228
 */
2229
HTSU_Result
2230
heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
2231 2232
			ItemPointer ctid, TransactionId *update_xmax,
			CommandId cid, Snapshot crosscheck, bool wait)
2233
{
2234
	HTSU_Result result;
2235
	TransactionId xid = GetCurrentTransactionId();
2236
	Bitmapset  *hot_attrs;
Bruce Momjian's avatar
Bruce Momjian committed
2237 2238
	ItemId		lp;
	HeapTupleData oldtup;
2239
	HeapTuple	heaptup;
Bruce Momjian's avatar
Bruce Momjian committed
2240
	PageHeader	dp;
2241 2242 2243 2244
	Buffer		buffer,
				newbuf;
	bool		need_toast,
				already_marked;
2245 2246
	Size		newtupsize,
				pagefree;
2247
	bool		have_tuple_lock = false;
2248
	bool		iscombo;
2249
	bool		use_hot_update = false;
2250

2251 2252
	Assert(ItemPointerIsValid(otid));

2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266
	/*
	 * Fetch the list of attributes to be checked for HOT update.  This is
	 * wasted effort if we fail to update or have to put the new tuple on
	 * a different page.  But we must compute the list before obtaining
	 * buffer lock --- in the worst case, if we are doing an update on one
	 * of the relevant system catalogs, we could deadlock if we try to
	 * fetch the list later.  In any case, the relcache caches the data
	 * so this is usually pretty cheap.
	 *
	 * Note that we get a copy here, so we need not worry about relcache
	 * flush happening midway through.
	 */
	hot_attrs = RelationGetIndexAttrBitmap(relation);

2267
	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2268
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2269

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2270
	dp = (PageHeader) BufferGetPage(buffer);
2271
	lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(otid));
2272
	Assert(ItemIdIsNormal(lp));
2273

2274 2275 2276
	oldtup.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
	oldtup.t_len = ItemIdGetLength(lp);
	oldtup.t_self = *otid;
2277

2278 2279 2280
	/*
	 * Note: beyond this point, use oldtup not otid to refer to old tuple.
	 * otid may very well point at newtup->t_self, which we will overwrite
2281 2282
	 * with the new tuple's location, so there's great risk of confusion if we
	 * use otid anymore.
2283
	 */
2284

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2285
l2:
2286
	result = HeapTupleSatisfiesUpdate(oldtup.t_data, cid, buffer);
Bruce Momjian's avatar
Bruce Momjian committed
2287

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2288
	if (result == HeapTupleInvisible)
2289
	{
2290
		UnlockReleaseBuffer(buffer);
2291
		elog(ERROR, "attempted to update invisible tuple");
2292
	}
2293
	else if (result == HeapTupleBeingUpdated && wait)
2294
	{
2295
		TransactionId xwait;
2296
		uint16		infomask;
2297

2298 2299 2300 2301 2302 2303 2304
		/* must copy state data before unlocking buffer */
		xwait = HeapTupleHeaderGetXmax(oldtup.t_data);
		infomask = oldtup.t_data->t_infomask;

		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

		/*
2305 2306
		 * Acquire tuple lock to establish our priority for the tuple (see
		 * heap_lock_tuple).  LockTuple will release us when we are
2307 2308
		 * next-in-line for the tuple.
		 *
2309 2310
		 * If we are forced to "start over" below, we keep the tuple lock;
		 * this arranges that we stay at the head of the line while rechecking
2311
		 * tuple state.
2312 2313 2314 2315 2316 2317 2318
		 */
		if (!have_tuple_lock)
		{
			LockTuple(relation, &(oldtup.t_self), ExclusiveLock);
			have_tuple_lock = true;
		}

2319
		/*
2320 2321 2322
		 * Sleep until concurrent transaction ends.  Note that we don't care
		 * if the locker has an exclusive or shared lock, because we need
		 * exclusive.
2323
		 */
2324 2325

		if (infomask & HEAP_XMAX_IS_MULTI)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2326
		{
2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341
			/* wait for multixact */
			MultiXactIdWait((MultiXactId) xwait);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

			/*
			 * If xwait had just locked the tuple then some other xact could
			 * update this tuple before we get to this point.  Check for xmax
			 * change, and start over if so.
			 */
			if (!(oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
				!TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
									 xwait))
				goto l2;

			/*
2342 2343 2344 2345 2346 2347
			 * You might think the multixact is necessarily done here, but not
			 * so: it could have surviving members, namely our own xact or
			 * other subxacts of this backend.	It is legal for us to update
			 * the tuple in either case, however (the latter case is
			 * essentially a situation of upgrading our former shared lock to
			 * exclusive).	We don't bother changing the on-disk hint bits
2348 2349
			 * since we are about to overwrite the xmax altogether.
			 */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2350
		}
2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366
		else
		{
			/* wait for regular transaction to end */
			XactLockTableWait(xwait);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

			/*
			 * xwait is done, but if xwait had just locked the tuple then some
			 * other xact could update this tuple before we get to this point.
			 * Check for xmax change, and start over if so.
			 */
			if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
				!TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
									 xwait))
				goto l2;

2367 2368
			/* Otherwise check if it committed or aborted */
			UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
2369 2370 2371
		}

		/*
2372 2373
		 * We may overwrite if previous xmax aborted, or if it committed but
		 * only locked the tuple without updating it.
2374 2375 2376
		 */
		if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID |
										 HEAP_IS_LOCKED))
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2377 2378 2379 2380
			result = HeapTupleMayBeUpdated;
		else
			result = HeapTupleUpdated;
	}
2381

2382
	if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
2383 2384
	{
		/* Perform additional check for serializable RI updates */
2385
		if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
2386 2387 2388
			result = HeapTupleUpdated;
	}

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2389 2390
	if (result != HeapTupleMayBeUpdated)
	{
2391 2392 2393
		Assert(result == HeapTupleSelfUpdated ||
			   result == HeapTupleUpdated ||
			   result == HeapTupleBeingUpdated);
2394
		Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
2395
		*ctid = oldtup.t_data->t_ctid;
2396
		*update_xmax = HeapTupleHeaderGetXmax(oldtup.t_data);
2397
		UnlockReleaseBuffer(buffer);
2398 2399
		if (have_tuple_lock)
			UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
2400
		bms_free(hot_attrs);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2401
		return result;
2402 2403
	}

2404
	/* Fill in OID and transaction status data for newtup */
2405 2406
	if (relation->rd_rel->relhasoids)
	{
2407 2408 2409 2410
#ifdef NOT_USED
		/* this is redundant with an Assert in HeapTupleSetOid */
		Assert(newtup->t_data->t_infomask & HEAP_HASOID);
#endif
2411 2412
		HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
	}
2413 2414 2415 2416 2417 2418
	else
	{
		/* check there is not space for an OID */
		Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
	}

2419
	newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2420
	newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2421
	newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);
2422
	HeapTupleHeaderSetXmin(newtup->t_data, xid);
2423
	HeapTupleHeaderSetCmin(newtup->t_data, cid);
2424 2425 2426 2427 2428 2429 2430
	HeapTupleHeaderSetXmax(newtup->t_data, 0);	/* for cleanliness */

	/*
	 * Replace cid with a combo cid if necessary.  Note that we already put
	 * the plain cid into the new tuple.
	 */
	HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
2431

2432
	/*
2433
	 * If the toaster needs to be activated, OR if the new tuple will not fit
2434
	 * on the same page as the old, then we need to release the content lock
2435 2436 2437 2438
	 * (but not the pin!) on the old tuple's buffer while we are off doing
	 * TOAST and/or table-file-extension work.	We must mark the old tuple to
	 * show that it's already being updated, else other processes may try to
	 * update it themselves.
2439
	 *
2440 2441
	 * We need to invoke the toaster if there are already any out-of-line
	 * toasted values present, or if the new tuple is over-threshold.
2442
	 */
2443
	if (relation->rd_rel->relkind != RELKIND_RELATION)
2444 2445 2446 2447 2448 2449 2450 2451 2452 2453
	{
		/* toast table entries should never be recursively toasted */
		Assert(!HeapTupleHasExternal(&oldtup));
		Assert(!HeapTupleHasExternal(newtup));
		need_toast = false;
	}
	else
		need_toast = (HeapTupleHasExternal(&oldtup) ||
					  HeapTupleHasExternal(newtup) ||
					  newtup->t_len > TOAST_TUPLE_THRESHOLD);
2454

2455
	pagefree = PageGetHeapFreeSpace((Page) dp);
2456

2457 2458
	newtupsize = MAXALIGN(newtup->t_len);

2459
	if (need_toast || newtupsize > pagefree)
2460
	{
2461
		/* Clear obsolete visibility flags ... */
2462
		oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
2463
									   HEAP_XMAX_INVALID |
2464 2465
									   HEAP_XMAX_IS_MULTI |
									   HEAP_IS_LOCKED |
2466
									   HEAP_MOVED);
2467 2468
		HeapTupleClearHotUpdated(&oldtup);
		/* ... and store info about transaction updating this tuple */
2469
		HeapTupleHeaderSetXmax(oldtup.t_data, xid);
2470
		HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
2471 2472
		/* temporarily make it look not-updated */
		oldtup.t_data->t_ctid = oldtup.t_self;
2473
		already_marked = true;
2474
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2475

2476 2477 2478 2479 2480
		/*
		 * Let the toaster do its thing, if needed.
		 *
		 * Note: below this point, heaptup is the data we actually intend to
		 * store into the relation; newtup is the caller's original untoasted
2481
		 * data.
2482
		 */
2483
		if (need_toast)
2484
		{
2485 2486 2487
			/* Note we always use WAL and FSM during updates */
			heaptup = toast_insert_or_update(relation, newtup, &oldtup,
											 true, true);
2488
			newtupsize = MAXALIGN(heaptup->t_len);
2489
		}
2490 2491
		else
			heaptup = newtup;
2492

2493
		/*
2494 2495 2496 2497 2498 2499
		 * Now, do we need a new page for the tuple, or not?  This is a bit
		 * tricky since someone else could have added tuples to the page while
		 * we weren't looking.  We have to recheck the available space after
		 * reacquiring the buffer lock.  But don't bother to do that if the
		 * former amount of free space is still not enough; it's unlikely
		 * there's more free now than before.
2500 2501
		 *
		 * What's more, if we need to get a new page, we will need to acquire
2502 2503 2504 2505 2506 2507 2508
		 * buffer locks on both old and new pages.	To avoid deadlock against
		 * some other backend trying to get the same two locks in the other
		 * order, we must be consistent about the order we get the locks in.
		 * We use the rule "lock the lower-numbered page of the relation
		 * first".  To implement this, we must do RelationGetBufferForTuple
		 * while not holding the lock on the old page, and we must rely on it
		 * to get the locks on both pages in the correct order.
2509 2510 2511
		 */
		if (newtupsize > pagefree)
		{
2512 2513
			/* Assume there's no chance to put heaptup on same page. */
			newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
2514
											   buffer, true);
2515
		}
2516
		else
2517 2518 2519 2520
		{
			/* Re-acquire the lock on the old tuple's page. */
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			/* Re-check using the up-to-date free space */
2521
			pagefree = PageGetHeapFreeSpace((Page) dp);
2522 2523 2524 2525
			if (newtupsize > pagefree)
			{
				/*
				 * Rats, it doesn't fit anymore.  We must now unlock and
2526 2527
				 * relock to avoid deadlock.  Fortunately, this path should
				 * seldom be taken.
2528 2529
				 */
				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2530
				newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
2531
												   buffer, true);
2532 2533 2534 2535 2536 2537 2538
			}
			else
			{
				/* OK, it fits here, so we're done. */
				newbuf = buffer;
			}
		}
2539
	}
2540 2541 2542 2543 2544
	else
	{
		/* No TOAST work needed, and it'll fit on same page */
		already_marked = false;
		newbuf = buffer;
2545
		heaptup = newtup;
2546
	}
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2547

2548
	/*
2549 2550 2551
	 * At this point newbuf and buffer are both pinned and locked, and newbuf
	 * has enough space for the new tuple.	If they are the same buffer, only
	 * one pin is held.
2552 2553
	 */

2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569
	if (newbuf == buffer)
	{
		/*
		 * Since the new tuple is going into the same page, we might be able
		 * to do a HOT update.  Check if any of the index columns have been
		 * changed.  If not, then HOT update is possible.
		 */
		if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup))
			use_hot_update = true;
	}
	else
	{
		/* Set a hint that the old page could use prune/defrag */
		PageSetFull(dp);
	}

2570
	/* NO EREPORT(ERROR) from here till changes are logged */
2571
	START_CRIT_SECTION();
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2572

2573 2574
	/*
	 * If this transaction commits, the old tuple will become DEAD sooner or
2575 2576 2577 2578
	 * later.  Set flag that this page is a candidate for pruning once our xid
	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
	 * the subsequent page pruning will be a no-op and the hint will be
	 * cleared.
2579 2580 2581 2582
	 *
	 * XXX Should we set hint on newbuf as well?  If the transaction
	 * aborts, there would be a prunable tuple in the newbuf; but for now
	 * we choose not to optimize for aborts.  Note that heap_xlog_update
2583
	 * must be kept in sync if this decision changes.
2584
	 */
2585
	PageSetPrunable(dp, xid);
2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603

	if (use_hot_update)
	{
		/* Mark the old tuple as HOT-updated */
		HeapTupleSetHotUpdated(&oldtup);
		/* And mark the new tuple as heap-only */
		HeapTupleSetHeapOnly(heaptup);
		/* Mark the caller's copy too, in case different from heaptup */
		HeapTupleSetHeapOnly(newtup);
	}
	else
	{
		/* Make sure tuples are correctly marked as not-HOT */
		HeapTupleClearHotUpdated(&oldtup);
		HeapTupleClearHeapOnly(heaptup);
		HeapTupleClearHeapOnly(newtup);
	}

2604
	RelationPutHeapTuple(relation, newbuf, heaptup);	/* insert new tuple */
2605

2606
	if (!already_marked)
2607
	{
2608
		/* Clear obsolete visibility flags ... */
2609 2610
		oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
									   HEAP_XMAX_INVALID |
2611 2612
									   HEAP_XMAX_IS_MULTI |
									   HEAP_IS_LOCKED |
2613
									   HEAP_MOVED);
2614
		/* ... and store info about transaction updating this tuple */
2615
		HeapTupleHeaderSetXmax(oldtup.t_data, xid);
2616
		HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
2617
	}
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2618

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2619
	/* record address of new tuple in t_ctid of old one */
2620
	oldtup.t_data->t_ctid = heaptup->t_self;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2621

2622 2623 2624 2625
	if (newbuf != buffer)
		MarkBufferDirty(newbuf);
	MarkBufferDirty(buffer);

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2626
	/* XLOG stuff */
2627
	if (!relation->rd_istemp)
2628
	{
2629
		XLogRecPtr	recptr = log_heap_update(relation, buffer, oldtup.t_self,
2630
											 newbuf, heaptup, false);
Bruce Momjian's avatar
Bruce Momjian committed
2631

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2632 2633
		if (newbuf != buffer)
		{
2634
			PageSetLSN(BufferGetPage(newbuf), recptr);
2635
			PageSetTLI(BufferGetPage(newbuf), ThisTimeLineID);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2636
		}
2637
		PageSetLSN(BufferGetPage(buffer), recptr);
2638
		PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
2639
	}
2640

2641
	END_CRIT_SECTION();
2642

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2643 2644
	if (newbuf != buffer)
		LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2645 2646
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

2647 2648
	/*
	 * Mark old tuple for invalidation from system caches at next command
2649 2650
	 * boundary. We have to do this before releasing the buffer because we
	 * need to look at the contents of the tuple.
2651
	 */
2652
	CacheInvalidateHeapTuple(relation, &oldtup);
2653

2654
	/* Now we can release the buffer(s) */
2655
	if (newbuf != buffer)
2656 2657
		ReleaseBuffer(newbuf);
	ReleaseBuffer(buffer);
2658 2659

	/*
2660
	 * If new tuple is cachable, mark it for invalidation from the caches in
2661 2662 2663
	 * case we abort.  Note it is OK to do this after releasing the buffer,
	 * because the heaptup data structure is all in local memory, not in the
	 * shared buffer.
2664
	 */
2665
	CacheInvalidateHeapTuple(relation, heaptup);
2666

2667 2668 2669 2670 2671 2672
	/*
	 * Release the lmgr tuple lock, if we had it.
	 */
	if (have_tuple_lock)
		UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);

2673
	pgstat_count_heap_update(relation, use_hot_update);
2674

2675 2676 2677 2678 2679 2680 2681 2682 2683 2684
	/*
	 * If heaptup is a private copy, release it.  Don't forget to copy t_self
	 * back to the caller's image, too.
	 */
	if (heaptup != newtup)
	{
		newtup->t_self = heaptup->t_self;
		heap_freetuple(heaptup);
	}

2685 2686
	bms_free(hot_attrs);

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2687 2688 2689
	return HeapTupleMayBeUpdated;
}

2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797
/*
 * heap_tuple_attr_equals - is attribute 'attrnum' the same in both tuples?
 *
 * Subroutine for HeapSatisfiesHOTUpdate.  Returns true only when the two
 * tuples hold the same value for the attribute (both NULL counts as the
 * same); any case we decline to examine is reported as "not equal".
 */
static bool
heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
					   HeapTuple tup1, HeapTuple tup2)
{
	Datum		datum1,
				datum2;
	bool		null1,
				null2;
	Form_pg_attribute attr;

	/*
	 * A whole-tuple reference is always treated as changed.  Supporting it
	 * would only pay off for a no-op update, which is hardly a case worth
	 * optimizing for.
	 */
	if (attrnum == 0)
		return false;

	/*
	 * Among system attributes only OID and tableOID are compared; the
	 * others cannot be expected to be consistent in a HOT chain, or even to
	 * be set correctly yet in the new tuple.
	 */
	if (attrnum < 0 &&
		attrnum != ObjectIdAttributeNumber &&
		attrnum != TableOidAttributeNumber)
		return false;

	/*
	 * Fetch both values.  XXX this is pretty inefficient if there are many
	 * indexed columns.  Should HeapSatisfiesHOTUpdate do a single
	 * heap_deform_tuple call on each tuple, instead?  But that doesn't work
	 * for system columns ...
	 */
	datum1 = heap_getattr(tup1, attrnum, tupdesc, &null1);
	datum2 = heap_getattr(tup2, attrnum, tupdesc, &null2);

	/*
	 * With a NULL on either side the answer depends on nullness alone:
	 * equal if both are NULL, certainly different if only one is.
	 */
	if (null1 || null2)
		return (null1 && null2);

	/*
	 * Both values are non-null, so compare them bitwise.  This may be
	 * overly strict because there can be multiple binary representations
	 * for the same logical value, but we are OK as long as there are no
	 * false positives.  Using a type-specific equality operator is messy
	 * because there could be multiple notions of equality in different
	 * operator classes; furthermore, we cannot safely invoke user-defined
	 * functions while holding exclusive buffer lock.
	 */

	/* The only system columns that get this far are OIDs */
	if (attrnum < 0)
		return (DatumGetObjectId(datum1) == DatumGetObjectId(datum2));

	/* Ordinary user column: look up its storage properties */
	Assert(attrnum <= tupdesc->natts);
	attr = tupdesc->attrs[attrnum - 1];
	return datumIsEqual(datum1, datum2, attr->attbyval, attr->attlen);
}

/*
 * HeapSatisfiesHOTUpdate - decide whether an update can be done as HOT
 *
 * A HOT update is possible only if none of the columns used in index
 * definitions differ between the old and new tuples.
 *
 * The caller supplies the set of attributes to check in hot_attrs (we dare
 * not try to compute it while holding exclusive buffer lock...).  NOTE that
 * hot_attrs is consumed destructively here!  That is OK since this is
 * invoked at most once by heap_update().
 *
 * Returns true if safe to do HOT update.
 */
static bool
HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
					   HeapTuple oldtup, HeapTuple newtup)
{
	int			member;

	for (;;)
	{
		/* pull the next member out of the set; negative means exhausted */
		member = bms_first_member(hot_attrs);
		if (member < 0)
			break;

		/*
		 * Set members are offset so that system attributes fit; undo the
		 * offset to get the real attribute number.  A single changed
		 * attribute is enough to rule out a HOT update.
		 */
		if (!heap_tuple_attr_equals(RelationGetDescr(relation),
									member + FirstLowInvalidHeapAttributeNumber,
									oldtup, newtup))
			return false;
	}

	return true;
}

2798 2799 2800 2801 2802
/*
 *	simple_heap_update - replace a tuple
 *
 * This routine may be used to update a tuple when concurrent updates of
 * the target tuple are not expected (for example, because we have a lock
2803
 * on the relation associated with the tuple).	Any failure is reported
2804
 * via ereport().
2805 2806 2807 2808
 */
void
simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
{
2809
	HTSU_Result result;
2810 2811
	ItemPointerData update_ctid;
	TransactionId update_xmax;
2812

2813
	result = heap_update(relation, otid, tup,
2814
						 &update_ctid, &update_xmax,
2815
						 GetCurrentCommandId(), InvalidSnapshot,
Bruce Momjian's avatar
Bruce Momjian committed
2816
						 true /* wait for commit */ );
2817 2818 2819 2820
	switch (result)
	{
		case HeapTupleSelfUpdated:
			/* Tuple was already updated in current command? */
2821
			elog(ERROR, "tuple already updated by self");
2822 2823 2824 2825 2826 2827 2828
			break;

		case HeapTupleMayBeUpdated:
			/* done successfully */
			break;

		case HeapTupleUpdated:
2829
			elog(ERROR, "tuple concurrently updated");
2830 2831 2832
			break;

		default:
2833
			elog(ERROR, "unrecognized heap_update status: %u", result);
2834 2835 2836 2837
			break;
	}
}

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2838
/*
2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866
 *	heap_lock_tuple - lock a tuple in shared or exclusive mode
 *
 * Note that this acquires a buffer pin, which the caller must release.
 *
 * Input parameters:
 *	relation: relation containing tuple (caller must hold suitable lock)
 *	tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
 *	cid: current command ID (used for visibility test, and stored into
 *		tuple's cmax if lock is successful)
 *	mode: indicates if shared or exclusive tuple lock is desired
 *	nowait: if true, ereport rather than blocking if lock not available
 *
 * Output parameters:
 *	*tuple: all fields filled in
 *	*buffer: set to buffer holding tuple (pinned but not locked at exit)
 *	*ctid: set to tuple's t_ctid, but only in failure cases
 *	*update_xmax: set to tuple's xmax, but only in failure cases
 *
 * Function result may be:
 *	HeapTupleMayBeUpdated: lock was successfully acquired
 *	HeapTupleSelfUpdated: lock failed because tuple updated by self
 *	HeapTupleUpdated: lock failed because tuple updated by other xact
 *
 * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
 * If t_ctid is the same as t_self, the tuple was deleted; if different, the
 * tuple was updated, and t_ctid is the location of the replacement tuple.
 * (t_xmax is needed to verify that the replacement tuple matches.)
 *
2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884
 *
 * NOTES: because the shared-memory lock table is of finite size, but users
 * could reasonably want to lock large numbers of tuples, we do not rely on
 * the standard lock manager to store tuple-level locks over the long term.
 * Instead, a tuple is marked as locked by setting the current transaction's
 * XID as its XMAX, and setting additional infomask bits to distinguish this
 * usage from the more normal case of having deleted the tuple.  When
 * multiple transactions concurrently share-lock a tuple, the first locker's
 * XID is replaced in XMAX with a MultiTransactionId representing the set of
 * XIDs currently holding share-locks.
 *
 * When it is necessary to wait for a tuple-level lock to be released, the
 * basic delay is provided by XactLockTableWait or MultiXactIdWait on the
 * contents of the tuple's XMAX.  However, that mechanism will release all
 * waiters concurrently, so there would be a race condition as to which
 * waiter gets the tuple, potentially leading to indefinite starvation of
 * some waiters.  The possibility of share-locking makes the problem much
 * worse --- a steady stream of share-lockers can easily block an exclusive
2885
 * locker forever.	To provide more reliable semantics about who gets a
2886 2887 2888 2889 2890 2891 2892
 * tuple-level lock first, we use the standard lock manager.  The protocol
 * for waiting for a tuple-level lock is really
 *		LockTuple()
 *		XactLockTableWait()
 *		mark tuple as locked by me
 *		UnlockTuple()
 * When there are multiple waiters, arbitration of who is to get the lock next
2893
 * is provided by LockTuple().	However, at most one tuple-level lock will
2894 2895 2896 2897 2898
 * be held or awaited per backend at any time, so we don't risk overflow
 * of the lock table.  Note that incoming share-lockers are required to
 * do LockTuple as well, if there is any conflict, to ensure that they don't
 * starve out waiting exclusive-lockers.  However, if there is not any active
 * conflict for a tuple, we don't incur any extra overhead.
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2899
 */
2900
HTSU_Result
2901
heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer,
2902 2903
				ItemPointer ctid, TransactionId *update_xmax,
				CommandId cid, LockTupleMode mode, bool nowait)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2904
{
2905
	HTSU_Result result;
Bruce Momjian's avatar
Bruce Momjian committed
2906 2907 2908
	ItemPointer tid = &(tuple->t_self);
	ItemId		lp;
	PageHeader	dp;
2909
	TransactionId xid;
2910 2911
	TransactionId xmax;
	uint16		old_infomask;
2912
	uint16		new_infomask;
2913 2914 2915 2916
	LOCKMODE	tuple_lock_type;
	bool		have_tuple_lock = false;

	tuple_lock_type = (mode == LockTupleShared) ? ShareLock : ExclusiveLock;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2917 2918 2919 2920 2921 2922

	*buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
	LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);

	dp = (PageHeader) BufferGetPage(*buffer);
	lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
2923
	Assert(ItemIdIsNormal(lp));
2924

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2925 2926
	tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
	tuple->t_len = ItemIdGetLength(lp);
2927
	tuple->t_tableOid = RelationGetRelid(relation);
Bruce Momjian's avatar
Bruce Momjian committed
2928

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2929
l3:
2930
	result = HeapTupleSatisfiesUpdate(tuple->t_data, cid, *buffer);
Bruce Momjian's avatar
Bruce Momjian committed
2931

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2932 2933
	if (result == HeapTupleInvisible)
	{
2934
		UnlockReleaseBuffer(*buffer);
2935
		elog(ERROR, "attempted to lock invisible tuple");
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2936 2937 2938
	}
	else if (result == HeapTupleBeingUpdated)
	{
2939
		TransactionId xwait;
2940
		uint16		infomask;
2941 2942 2943 2944 2945 2946 2947

		/* must copy state data before unlocking buffer */
		xwait = HeapTupleHeaderGetXmax(tuple->t_data);
		infomask = tuple->t_data->t_infomask;

		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);

2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966
		/*
		 * If we wish to acquire share lock, and the tuple is already
		 * share-locked by a multixact that includes any subtransaction of the
		 * current top transaction, then we effectively hold the desired lock
		 * already.  We *must* succeed without trying to take the tuple lock,
		 * else we will deadlock against anyone waiting to acquire exclusive
		 * lock.  We don't need to make any state changes in this case.
		 */
		if (mode == LockTupleShared &&
			(infomask & HEAP_XMAX_IS_MULTI) &&
			MultiXactIdIsCurrent((MultiXactId) xwait))
		{
			Assert(infomask & HEAP_XMAX_SHARED_LOCK);
			/* Probably can't hold tuple lock here, but may as well check */
			if (have_tuple_lock)
				UnlockTuple(relation, tid, tuple_lock_type);
			return HeapTupleMayBeUpdated;
		}

2967 2968
		/*
		 * Acquire tuple lock to establish our priority for the tuple.
2969 2970
		 * LockTuple will release us when we are next-in-line for the tuple.
		 * We must do this even if we are share-locking.
2971
		 *
2972 2973
		 * If we are forced to "start over" below, we keep the tuple lock;
		 * this arranges that we stay at the head of the line while rechecking
2974
		 * tuple state.
2975 2976
		 */
		if (!have_tuple_lock)
2977
		{
2978 2979 2980 2981 2982
			if (nowait)
			{
				if (!ConditionalLockTuple(relation, tid, tuple_lock_type))
					ereport(ERROR,
							(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
2983 2984
					errmsg("could not obtain lock on row in relation \"%s\"",
						   RelationGetRelationName(relation))));
2985 2986 2987
			}
			else
				LockTuple(relation, tid, tuple_lock_type);
2988 2989
			have_tuple_lock = true;
		}
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2990

2991 2992
		if (mode == LockTupleShared && (infomask & HEAP_XMAX_SHARED_LOCK))
		{
2993
			/*
2994 2995
			 * Acquiring sharelock when there's at least one sharelocker
			 * already.  We need not wait for him/them to complete.
2996
			 */
2997
			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
2998

2999
			/*
3000 3001
			 * Make sure it's still a shared lock, else start over.  (It's OK
			 * if the ownership of the shared lock has changed, though.)
3002 3003 3004 3005 3006 3007 3008
			 */
			if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK))
				goto l3;
		}
		else if (infomask & HEAP_XMAX_IS_MULTI)
		{
			/* wait for multixact to end */
3009 3010 3011 3012 3013
			if (nowait)
			{
				if (!ConditionalMultiXactIdWait((MultiXactId) xwait))
					ereport(ERROR,
							(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
3014 3015
					errmsg("could not obtain lock on row in relation \"%s\"",
						   RelationGetRelationName(relation))));
3016 3017 3018 3019
			}
			else
				MultiXactIdWait((MultiXactId) xwait);

3020
			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
3021

3022
			/*
3023 3024 3025
			 * If xwait had just locked the tuple then some other xact could
			 * update this tuple before we get to this point. Check for xmax
			 * change, and start over if so.
3026 3027 3028 3029 3030
			 */
			if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
				!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
									 xwait))
				goto l3;
3031

3032
			/*
3033 3034 3035 3036 3037 3038
			 * You might think the multixact is necessarily done here, but not
			 * so: it could have surviving members, namely our own xact or
			 * other subxacts of this backend.	It is legal for us to lock the
			 * tuple in either case, however.  We don't bother changing the
			 * on-disk hint bits since we are about to overwrite the xmax
			 * altogether.
3039 3040 3041 3042 3043
			 */
		}
		else
		{
			/* wait for regular transaction to end */
3044 3045 3046 3047 3048
			if (nowait)
			{
				if (!ConditionalXactLockTableWait(xwait))
					ereport(ERROR,
							(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
3049 3050
					errmsg("could not obtain lock on row in relation \"%s\"",
						   RelationGetRelationName(relation))));
3051 3052 3053 3054
			}
			else
				XactLockTableWait(xwait);

3055
			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
3056 3057

			/*
3058 3059 3060
			 * xwait is done, but if xwait had just locked the tuple then some
			 * other xact could update this tuple before we get to this point.
			 * Check for xmax change, and start over if so.
3061
			 */
3062 3063 3064 3065 3066
			if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
				!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
									 xwait))
				goto l3;

3067 3068
			/* Otherwise check if it committed or aborted */
			UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3069
		}
3070 3071

		/*
3072 3073 3074 3075
		 * We may lock if previous xmax aborted, or if it committed but only
		 * locked the tuple without updating it.  The case where we didn't
		 * wait because we are joining an existing shared lock is correctly
		 * handled, too.
3076 3077 3078 3079 3080 3081
		 */
		if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID |
										 HEAP_IS_LOCKED))
			result = HeapTupleMayBeUpdated;
		else
			result = HeapTupleUpdated;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3082
	}
3083

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3084 3085 3086
	if (result != HeapTupleMayBeUpdated)
	{
		Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
3087 3088 3089
		Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
		*ctid = tuple->t_data->t_ctid;
		*update_xmax = HeapTupleHeaderGetXmax(tuple->t_data);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3090
		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
3091 3092
		if (have_tuple_lock)
			UnlockTuple(relation, tid, tuple_lock_type);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3093 3094 3095
		return result;
	}

3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124
	/*
	 * We might already hold the desired lock (or stronger), possibly under
	 * a different subtransaction of the current top transaction.  If so,
	 * there is no need to change state or issue a WAL record.  We already
	 * handled the case where this is true for xmax being a MultiXactId,
	 * so now check for cases where it is a plain TransactionId.
	 *
	 * Note in particular that this covers the case where we already hold
	 * exclusive lock on the tuple and the caller only wants shared lock.
	 * It would certainly not do to give up the exclusive lock.
	 */
	xmax = HeapTupleHeaderGetXmax(tuple->t_data);
	old_infomask = tuple->t_data->t_infomask;

	if (!(old_infomask & (HEAP_XMAX_INVALID |
						  HEAP_XMAX_COMMITTED |
						  HEAP_XMAX_IS_MULTI)) &&
		(mode == LockTupleShared ?
		 (old_infomask & HEAP_IS_LOCKED) :
		 (old_infomask & HEAP_XMAX_EXCL_LOCK)) &&
		TransactionIdIsCurrentTransactionId(xmax))
	{
		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
		/* Probably can't hold tuple lock here, but may as well check */
		if (have_tuple_lock)
			UnlockTuple(relation, tid, tuple_lock_type);
		return HeapTupleMayBeUpdated;
	}

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3125
	/*
3126 3127 3128
	 * Compute the new xmax and infomask to store into the tuple.  Note we do
	 * not modify the tuple just yet, because that would leave it in the wrong
	 * state if multixact.c elogs.
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3129
	 */
3130 3131
	xid = GetCurrentTransactionId();

3132 3133 3134 3135 3136
	new_infomask = old_infomask & ~(HEAP_XMAX_COMMITTED |
									HEAP_XMAX_INVALID |
									HEAP_XMAX_IS_MULTI |
									HEAP_IS_LOCKED |
									HEAP_MOVED);
3137 3138 3139 3140 3141

	if (mode == LockTupleShared)
	{
		/*
		 * If this is the first acquisition of a shared lock in the current
3142 3143 3144 3145 3146
		 * transaction, set my per-backend OldestMemberMXactId setting. We can
		 * be certain that the transaction will never become a member of any
		 * older MultiXactIds than that.  (We have to do this even if we end
		 * up just using our own TransactionId below, since some other backend
		 * could incorporate our XID into a MultiXact immediately afterwards.)
3147 3148 3149 3150 3151 3152 3153 3154 3155
		 */
		MultiXactIdSetOldestMember();

		new_infomask |= HEAP_XMAX_SHARED_LOCK;

		/*
		 * Check to see if we need a MultiXactId because there are multiple
		 * lockers.
		 *
3156 3157 3158 3159
		 * HeapTupleSatisfiesUpdate will have set the HEAP_XMAX_INVALID bit if
		 * the xmax was a MultiXactId but it was not running anymore. There is
		 * a race condition, which is that the MultiXactId may have finished
		 * since then, but that uncommon case is handled within
3160 3161
		 * MultiXactIdExpand.
		 *
3162 3163
		 * There is a similar race condition possible when the old xmax was a
		 * regular TransactionId.  We test TransactionIdIsInProgress again
3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174
		 * just to narrow the window, but it's still possible to end up
		 * creating an unnecessary MultiXactId.  Fortunately this is harmless.
		 */
		if (!(old_infomask & (HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED)))
		{
			if (old_infomask & HEAP_XMAX_IS_MULTI)
			{
				/*
				 * If the XMAX is already a MultiXactId, then we need to
				 * expand it to include our own TransactionId.
				 */
3175
				xid = MultiXactIdExpand((MultiXactId) xmax, xid);
3176 3177 3178 3179
				new_infomask |= HEAP_XMAX_IS_MULTI;
			}
			else if (TransactionIdIsInProgress(xmax))
			{
3180 3181 3182 3183 3184 3185 3186
				/*
				 * If the XMAX is a valid TransactionId, then we need to
				 * create a new MultiXactId that includes both the old
				 * locker and our own TransactionId.
				 */
				xid = MultiXactIdCreate(xmax, xid);
				new_infomask |= HEAP_XMAX_IS_MULTI;
3187 3188 3189 3190
			}
			else
			{
				/*
3191 3192 3193
				 * Can get here iff HeapTupleSatisfiesUpdate saw the old xmax
				 * as running, but it finished before
				 * TransactionIdIsInProgress() got to run.	Treat it like
3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212
				 * there's no locker in the tuple.
				 */
			}
		}
		else
		{
			/*
			 * There was no previous locker, so just insert our own
			 * TransactionId.
			 */
		}
	}
	else
	{
		/* We want an exclusive lock on the tuple */
		new_infomask |= HEAP_XMAX_EXCL_LOCK;
	}

	START_CRIT_SECTION();
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3213

3214 3215 3216
	/*
	 * Store transaction information of xact locking the tuple.
	 *
3217 3218
	 * Note: Cmax is meaningless in this context, so don't set it; this
	 * avoids possibly generating a useless combo CID.
3219 3220
	 */
	tuple->t_data->t_infomask = new_infomask;
3221
	HeapTupleHeaderClearHotUpdated(tuple->t_data);
3222
	HeapTupleHeaderSetXmax(tuple->t_data, xid);
3223 3224
	/* Make sure there is no forward chain link in t_ctid */
	tuple->t_data->t_ctid = *tid;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3225

3226 3227
	MarkBufferDirty(*buffer);

3228
	/*
3229 3230
	 * XLOG stuff.	You might think that we don't need an XLOG record because
	 * there is no state change worth restoring after a crash.	You would be
3231 3232 3233 3234 3235 3236
	 * wrong however: we have just written either a TransactionId or a
	 * MultiXactId that may never have been seen on disk before, and we need
	 * to make sure that there are XLOG entries covering those ID numbers.
	 * Else the same IDs might be re-used after a crash, which would be
	 * disastrous if this page made it to disk before the crash.  Essentially
	 * we have to enforce the WAL log-before-data rule even in this case.
3237 3238
	 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
	 * entries for everything anyway.)
3239 3240 3241 3242 3243 3244 3245 3246 3247
	 */
	if (!relation->rd_istemp)
	{
		xl_heap_lock xlrec;
		XLogRecPtr	recptr;
		XLogRecData rdata[2];

		xlrec.target.node = relation->rd_node;
		xlrec.target.tid = tuple->t_self;
3248 3249
		xlrec.locking_xid = xid;
		xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0);
3250 3251 3252
		xlrec.shared_lock = (mode == LockTupleShared);
		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfHeapLock;
3253
		rdata[0].buffer = InvalidBuffer;
3254 3255 3256 3257
		rdata[0].next = &(rdata[1]);

		rdata[1].data = NULL;
		rdata[1].len = 0;
3258 3259
		rdata[1].buffer = *buffer;
		rdata[1].buffer_std = true;
3260 3261 3262 3263 3264 3265 3266 3267 3268 3269
		rdata[1].next = NULL;

		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK, rdata);

		PageSetLSN(dp, recptr);
		PageSetTLI(dp, ThisTimeLineID);
	}

	END_CRIT_SECTION();

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3270 3271
	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);

3272 3273 3274 3275 3276 3277 3278
	/*
	 * Now that we have successfully marked the tuple as locked, we can
	 * release the lmgr tuple lock, if we had it.
	 */
	if (have_tuple_lock)
		UnlockTuple(relation, tid, tuple_lock_type);

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3279
	return HeapTupleMayBeUpdated;
3280 3281
}

3282 3283 3284 3285 3286

/*
 * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
 *
 * Overwriting violates both MVCC and transactional safety, so the uses
Bruce Momjian's avatar
Bruce Momjian committed
3287
 * of this function in Postgres are extremely limited.	Nonetheless we
3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316
 * find some places to use it.
 *
 * The tuple cannot change size, and therefore it's reasonable to assume
 * that its null bitmap (if any) doesn't change either.  So we just
 * overwrite the data portion of the tuple without touching the null
 * bitmap or any of the header fields.
 *
 * tuple is an in-memory tuple structure containing the data to be written
 * over the target tuple.  Also, tuple->t_self identifies the target tuple.
 */
void
heap_inplace_update(Relation relation, HeapTuple tuple)
{
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;
	uint32		oldlen;
	uint32		newlen;

	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
	page = (Page) BufferGetPage(buffer);

	offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
	if (PageGetMaxOffsetNumber(page) >= offnum)
		lp = PageGetItemId(page, offnum);

3317
	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372
		elog(ERROR, "heap_inplace_update: invalid lp");

	htup = (HeapTupleHeader) PageGetItem(page, lp);

	oldlen = ItemIdGetLength(lp) - htup->t_hoff;
	newlen = tuple->t_len - tuple->t_data->t_hoff;
	if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
		elog(ERROR, "heap_inplace_update: wrong tuple length");

	/* NO EREPORT(ERROR) from here till changes are logged */
	START_CRIT_SECTION();

	memcpy((char *) htup + htup->t_hoff,
		   (char *) tuple->t_data + tuple->t_data->t_hoff,
		   newlen);

	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (!relation->rd_istemp)
	{
		xl_heap_inplace xlrec;
		XLogRecPtr	recptr;
		XLogRecData rdata[2];

		xlrec.target.node = relation->rd_node;
		xlrec.target.tid = tuple->t_self;

		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfHeapInplace;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);

		rdata[1].data = (char *) htup + htup->t_hoff;
		rdata[1].len = newlen;
		rdata[1].buffer = buffer;
		rdata[1].buffer_std = true;
		rdata[1].next = NULL;

		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE, rdata);

		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}

	END_CRIT_SECTION();

	UnlockReleaseBuffer(buffer);

	/* Send out shared cache inval if necessary */
	if (!IsBootstrapProcessingMode())
		CacheInvalidateHeapTuple(relation, tuple);
}


3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466
/*
 * heap_freeze_tuple
 *
 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
 * are older than the specified cutoff XID.  If so, replace them with
 * FrozenTransactionId or InvalidTransactionId as appropriate, and return
 * TRUE.  Return FALSE if nothing was changed.
 *
 * It is assumed that the caller has checked the tuple with
 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
 * (else we should be removing the tuple, not freezing it).
 *
 * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
 * XID older than it could neither be running nor seen as running by any
 * open transaction.  This ensures that the replacement will not change
 * anyone's idea of the tuple state.  Also, since we assume the tuple is
 * not HEAPTUPLE_DEAD, the fact that an XID is not still running allows us
 * to assume that it is either committed good or aborted, as appropriate;
 * so we need no external state checks to decide what to do.  (This is good
 * because this function is applied during WAL recovery, when we don't have
 * access to any such state, and can't depend on the hint bits to be set.)
 *
 * In lazy VACUUM, we call this while initially holding only a shared lock
 * on the tuple's buffer.  If any change is needed, we trade that in for an
 * exclusive lock before making the change.  Caller should pass the buffer ID
 * if shared lock is held, InvalidBuffer if exclusive lock is already held.
 *
 * Note: it might seem we could make the changes without exclusive lock, since
 * TransactionId read/write is assumed atomic anyway.  However there is a race
 * condition: someone who just fetched an old XID that we overwrite here could
 * conceivably not finish checking the XID against pg_clog before we finish
 * the VACUUM and perhaps truncate off the part of pg_clog he needs.  Getting
 * exclusive lock ensures no other backend is in process of checking the
 * tuple status.  Also, getting exclusive lock makes it safe to adjust the
 * infomask bits.
 */
bool
heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
				  Buffer buf)
{
	bool		changed = false;
	TransactionId xid;

	xid = HeapTupleHeaderGetXmin(tuple);
	if (TransactionIdIsNormal(xid) &&
		TransactionIdPrecedes(xid, cutoff_xid))
	{
		if (buf != InvalidBuffer)
		{
			/* trade in share lock for exclusive lock */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			buf = InvalidBuffer;
		}
		HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
		/*
		 * Might as well fix the hint bits too; usually XMIN_COMMITTED will
		 * already be set here, but there's a small chance not.
		 */
		Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
		changed = true;
	}

	/*
	 * When we release shared lock, it's possible for someone else to change
	 * xmax before we get the lock back, so repeat the check after acquiring
	 * exclusive lock.  (We don't need this pushup for xmin, because only
	 * VACUUM could be interested in changing an existing tuple's xmin,
	 * and there's only one VACUUM allowed on a table at a time.)
	 */
recheck_xmax:
	if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
	{
		xid = HeapTupleHeaderGetXmax(tuple);
		if (TransactionIdIsNormal(xid) &&
			TransactionIdPrecedes(xid, cutoff_xid))
		{
			if (buf != InvalidBuffer)
			{
				/* trade in share lock for exclusive lock */
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				buf = InvalidBuffer;
				goto recheck_xmax;			/* see comment above */
			}
			HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
			/*
			 * The tuple might be marked either XMAX_INVALID or
			 * XMAX_COMMITTED + LOCKED.  Normalize to INVALID just to be
			 * sure no one gets confused.
			 */
			tuple->t_infomask &= ~HEAP_XMAX_COMMITTED;
			tuple->t_infomask |= HEAP_XMAX_INVALID;
3467
			HeapTupleHeaderClearHotUpdated(tuple);
3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533
			changed = true;
		}
	}
	else
	{
		/*----------
		 * XXX perhaps someday we should zero out very old MultiXactIds here?
		 *
		 * The only way a stale MultiXactId could pose a problem is if a
		 * tuple, having once been multiply-share-locked, is not touched by
		 * any vacuum or attempted lock or deletion for just over 4G MultiXact
		 * creations, and then in the probably-narrow window where its xmax
		 * is again a live MultiXactId, someone tries to lock or delete it.
		 * Even then, another share-lock attempt would work fine.  An
		 * exclusive-lock or delete attempt would face unexpected delay, or
		 * in the very worst case get a deadlock error.  This seems an
		 * extremely low-probability scenario with minimal downside even if
		 * it does happen, so for now we don't do the extra bookkeeping that
		 * would be needed to clean out MultiXactIds.
		 *----------
		 */
	}

	/*
	 * Although xvac per se could only be set by VACUUM, it shares physical
	 * storage space with cmax, and so could be wiped out by someone setting
	 * xmax.  Hence recheck after changing lock, same as for xmax itself.
	 */
recheck_xvac:
	if (tuple->t_infomask & HEAP_MOVED)
	{
		xid = HeapTupleHeaderGetXvac(tuple);
		if (TransactionIdIsNormal(xid) &&
			TransactionIdPrecedes(xid, cutoff_xid))
		{
			if (buf != InvalidBuffer)
			{
				/* trade in share lock for exclusive lock */
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				buf = InvalidBuffer;
				goto recheck_xvac;			/* see comment above */
			}
			/*
			 * If a MOVED_OFF tuple is not dead, the xvac transaction must
			 * have failed; whereas a non-dead MOVED_IN tuple must mean the
			 * xvac transaction succeeded.
			 */
			if (tuple->t_infomask & HEAP_MOVED_OFF)
				HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
			else
				HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
			/*
			 * Might as well fix the hint bits too; usually XMIN_COMMITTED will
			 * already be set here, but there's a small chance not.
			 */
			Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
			changed = true;
		}
	}

	return changed;
}


3534
/* ----------------
3535
 *		heap_markpos	- mark scan position
3536 3537 3538
 * ----------------
 */
void
3539
heap_markpos(HeapScanDesc scan)
3540
{
3541 3542
	/* Note: no locking manipulations needed */

3543
	if (scan->rs_ctup.t_data != NULL)
3544
	{
3545
		scan->rs_mctid = scan->rs_ctup.t_self;
3546 3547 3548
		if (scan->rs_pageatatime)
			scan->rs_mindex = scan->rs_cindex;
	}
3549
	else
3550
		ItemPointerSetInvalid(&scan->rs_mctid);
3551 3552 3553
}

/* ----------------
3554
 *		heap_restrpos	- restore position to marked location
3555 3556 3557
 * ----------------
 */
void
3558
heap_restrpos(HeapScanDesc scan)
3559
{
3560 3561
	/* XXX no amrestrpos checking that ammarkpos called */

3562
	if (!ItemPointerIsValid(&scan->rs_mctid))
3563
	{
3564
		scan->rs_ctup.t_data = NULL;
Bruce Momjian's avatar
Bruce Momjian committed
3565

3566 3567 3568 3569 3570 3571 3572
		/*
		 * unpin scan buffers
		 */
		if (BufferIsValid(scan->rs_cbuf))
			ReleaseBuffer(scan->rs_cbuf);
		scan->rs_cbuf = InvalidBuffer;
		scan->rs_cblock = InvalidBlockNumber;
3573
		scan->rs_inited = false;
3574
	}
3575 3576
	else
	{
3577
		/*
Bruce Momjian's avatar
Bruce Momjian committed
3578
		 * If we reached end of scan, rs_inited will now be false.	We must
3579 3580 3581
		 * reset it to true to keep heapgettup from doing the wrong thing.
		 */
		scan->rs_inited = true;
3582
		scan->rs_ctup.t_self = scan->rs_mctid;
3583 3584 3585 3586
		if (scan->rs_pageatatime)
		{
			scan->rs_cindex = scan->rs_mindex;
			heapgettup_pagemode(scan,
3587
								NoMovementScanDirection,
Bruce Momjian's avatar
Bruce Momjian committed
3588
								0,		/* needn't recheck scan keys */
3589 3590 3591 3592
								NULL);
		}
		else
			heapgettup(scan,
3593
					   NoMovementScanDirection,
Bruce Momjian's avatar
Bruce Momjian committed
3594
					   0,		/* needn't recheck scan keys */
3595
					   NULL);
3596
	}
3597
}
3598

3599 3600 3601
/*
 * Perform XLogInsert for a heap-clean operation.  Caller must already
 * have modified the buffer and marked it dirty.
3602
 *
3603 3604 3605
 * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
 * zero-based tuple indexes.  Now they are one-based like other uses
 * of OffsetNumber.
3606
 */
3607
XLogRecPtr
3608 3609 3610 3611 3612
log_heap_clean(Relation reln, Buffer buffer,
			   OffsetNumber *redirected, int nredirected,
			   OffsetNumber *nowdead, int ndead,
			   OffsetNumber *nowunused, int nunused,
			   bool redirect_move)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3613
{
3614
	xl_heap_clean xlrec;
3615
	uint8		info;
3616
	XLogRecPtr	recptr;
3617
	XLogRecData rdata[4];
3618

3619 3620 3621
	/* Caller should not call me on a temp relation */
	Assert(!reln->rd_istemp);

3622 3623
	xlrec.node = reln->rd_node;
	xlrec.block = BufferGetBlockNumber(buffer);
3624 3625
	xlrec.nredirected = nredirected;
	xlrec.ndead = ndead;
3626

3627
	rdata[0].data = (char *) &xlrec;
3628
	rdata[0].len = SizeOfHeapClean;
3629
	rdata[0].buffer = InvalidBuffer;
3630
	rdata[0].next = &(rdata[1]);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3631

3632
	/*
3633 3634 3635 3636 3637 3638
	 * The OffsetNumber arrays are not actually in the buffer, but we pretend
	 * that they are.  When XLogInsert stores the whole buffer, the offset
	 * arrays need not be stored too.  Note that even if all three arrays
	 * are empty, we want to expose the buffer as a candidate for whole-page
	 * storage, since this record type implies a defragmentation operation
	 * even if no item pointers changed state.
3639
	 */
3640
	if (nredirected > 0)
3641
	{
3642 3643
		rdata[1].data = (char *) redirected;
		rdata[1].len = nredirected * sizeof(OffsetNumber) * 2;
3644 3645
	}
	else
3646 3647 3648 3649
	{
		rdata[1].data = NULL;
		rdata[1].len = 0;
	}
3650 3651
	rdata[1].buffer = buffer;
	rdata[1].buffer_std = true;
3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680
	rdata[1].next = &(rdata[2]);

	if (ndead > 0)
	{
		rdata[2].data = (char *) nowdead;
		rdata[2].len = ndead * sizeof(OffsetNumber);
	}
	else
	{
		rdata[2].data = NULL;
		rdata[2].len = 0;
	}
	rdata[2].buffer = buffer;
	rdata[2].buffer_std = true;
	rdata[2].next = &(rdata[3]);

	if (nunused > 0)
	{
		rdata[3].data = (char *) nowunused;
		rdata[3].len = nunused * sizeof(OffsetNumber);
	}
	else
	{
		rdata[3].data = NULL;
		rdata[3].len = 0;
	}
	rdata[3].buffer = buffer;
	rdata[3].buffer_std = true;
	rdata[3].next = NULL;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3681

3682 3683
	info = redirect_move ? XLOG_HEAP2_CLEAN_MOVE : XLOG_HEAP2_CLEAN;
	recptr = XLogInsert(RM_HEAP2_ID, info, rdata);
3684

3685
	return recptr;
3686 3687
}

3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736
/*
 * Perform XLogInsert for a heap-freeze operation.  Caller must already
 * have modified the buffer and marked it dirty.
 *
 * The record is the fixed xl_heap_freeze part (relation, block and
 * cutoff_xid) followed by the offsets[] array of offcnt entries.  The
 * value returned is whatever XLogInsert returns.
 */
XLogRecPtr
log_heap_freeze(Relation reln, Buffer buffer,
				TransactionId cutoff_xid,
				OffsetNumber *offsets, int offcnt)
{
	xl_heap_freeze xlrec;
	XLogRecData rdata[2];
	XLogRecData *fixed_seg = &rdata[0];
	XLogRecData *offsets_seg = &rdata[1];

	/* Caller should not call me on a temp relation */
	Assert(!reln->rd_istemp);

	xlrec.node = reln->rd_node;
	xlrec.block = BufferGetBlockNumber(buffer);
	xlrec.cutoff_xid = cutoff_xid;

	fixed_seg->data = (char *) &xlrec;
	fixed_seg->len = SizeOfHeapFreeze;
	fixed_seg->buffer = InvalidBuffer;
	fixed_seg->next = offsets_seg;

	/*
	 * The tuple-offsets array is not actually in the buffer, but pretend
	 * that it is.  When XLogInsert stores the whole buffer, the offsets array
	 * need not be stored too.
	 */
	offsets_seg->data = (offcnt > 0) ? (char *) offsets : NULL;
	offsets_seg->len = (offcnt > 0) ? offcnt * sizeof(OffsetNumber) : 0;
	offsets_seg->buffer = buffer;
	offsets_seg->buffer_std = true;
	offsets_seg->next = NULL;

	return XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE, rdata);
}

3737
/*
Bruce Momjian's avatar
Bruce Momjian committed
3738
 * Perform XLogInsert for a heap-update operation.	Caller must already
3739 3740
 * have modified the buffer(s) and marked them dirty.
 */
3741
static XLogRecPtr
3742
log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
3743 3744
				Buffer newbuf, HeapTuple newtup, bool move)
{
3745
	/*
3746 3747 3748 3749
	 * Note: xlhdr is declared to have adequate size and correct alignment for
	 * an xl_heap_header.  However the two tids, if present at all, will be
	 * packed in with no wasted space after the xl_heap_header; they aren't
	 * necessarily aligned as implied by this struct declaration.
3750
	 */
3751 3752 3753 3754 3755
	struct
	{
		xl_heap_header hdr;
		TransactionId tid1;
		TransactionId tid2;
3756
	}			xlhdr;
3757
	int			hsize = SizeOfHeapHeader;
3758
	xl_heap_update xlrec;
3759
	uint8		info;
3760 3761 3762
	XLogRecPtr	recptr;
	XLogRecData rdata[4];
	Page		page = BufferGetPage(newbuf);
3763

3764 3765 3766
	/* Caller should not call me on a temp relation */
	Assert(!reln->rd_istemp);

3767 3768 3769 3770 3771 3772 3773 3774 3775 3776
	if (move)
	{
		Assert(!HeapTupleIsHeapOnly(newtup));
		info = XLOG_HEAP_MOVE;
	}
	else if (HeapTupleIsHeapOnly(newtup))
		info = XLOG_HEAP_HOT_UPDATE;
	else
		info = XLOG_HEAP_UPDATE;

3777 3778 3779
	xlrec.target.node = reln->rd_node;
	xlrec.target.tid = from;
	xlrec.newtid = newtup->t_self;
3780

3781
	rdata[0].data = (char *) &xlrec;
3782
	rdata[0].len = SizeOfHeapUpdate;
3783
	rdata[0].buffer = InvalidBuffer;
3784 3785 3786 3787
	rdata[0].next = &(rdata[1]);

	rdata[1].data = NULL;
	rdata[1].len = 0;
3788 3789
	rdata[1].buffer = oldbuf;
	rdata[1].buffer_std = true;
3790 3791
	rdata[1].next = &(rdata[2]);

3792
	xlhdr.hdr.t_infomask2 = newtup->t_data->t_infomask2;
3793
	xlhdr.hdr.t_infomask = newtup->t_data->t_infomask;
3794
	xlhdr.hdr.t_hoff = newtup->t_data->t_hoff;
3795
	if (move)					/* remember xmax & xmin */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3796
	{
Bruce Momjian's avatar
Bruce Momjian committed
3797
		TransactionId xid[2];	/* xmax, xmin */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3798

3799
		if (newtup->t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED))
3800
			xid[0] = InvalidTransactionId;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3801
		else
3802 3803 3804
			xid[0] = HeapTupleHeaderGetXmax(newtup->t_data);
		xid[1] = HeapTupleHeaderGetXmin(newtup->t_data);
		memcpy((char *) &xlhdr + hsize,
Bruce Momjian's avatar
Bruce Momjian committed
3805
			   (char *) xid,
3806
			   2 * sizeof(TransactionId));
3807
		hsize += 2 * sizeof(TransactionId);
3808
	}
Bruce Momjian's avatar
Bruce Momjian committed
3809

3810
	/*
3811 3812
	 * As with insert records, we need not store the rdata[2] segment if we
	 * decide to store the whole buffer instead.
3813
	 */
3814
	rdata[2].data = (char *) &xlhdr;
3815
	rdata[2].len = hsize;
3816 3817
	rdata[2].buffer = newbuf;
	rdata[2].buffer_std = true;
3818 3819
	rdata[2].next = &(rdata[3]);

3820
	/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
3821
	rdata[3].data = (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits);
3822
	rdata[3].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
3823 3824
	rdata[3].buffer = newbuf;
	rdata[3].buffer_std = true;
3825 3826 3827 3828 3829 3830 3831 3832
	rdata[3].next = NULL;

	/* If new tuple is the single and first tuple on page... */
	if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
		PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
	{
		info |= XLOG_HEAP_INIT_PAGE;
		rdata[2].buffer = rdata[3].buffer = InvalidBuffer;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3833
	}
3834 3835

	recptr = XLogInsert(RM_HEAP_ID, info, rdata);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3836

3837
	return recptr;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3838 3839
}

3840 3841 3842 3843
/*
 * Perform XLogInsert for a heap-move operation.  Caller must already
 * have modified the buffers and marked them dirty.
 */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3844
XLogRecPtr
3845 3846
log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from,
			  Buffer newbuf, HeapTuple newtup)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3847
{
3848
	return log_heap_update(reln, oldbuf, from, newbuf, newtup, true);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3849 3850
}

3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895
/*
 * Perform XLogInsert of a HEAP_NEWPAGE record to WAL. Caller is responsible
 * for writing the page to disk after calling this routine.
 *
 * The record holds the xl_heap_newpage part followed by the full BLCKSZ
 * bytes of the page.  Once the record is inserted, the page's LSN and TLI
 * are stamped with the record's location and ThisTimeLineID; the record's
 * location is also the return value.
 *
 * Note: all current callers build pages in private memory and write them
 * directly to smgr, rather than using bufmgr.  Therefore there is no need
 * to pass a buffer ID to XLogInsert, nor to perform MarkBufferDirty within
 * the critical section.
 *
 * Note: the NEWPAGE log record is used for both heaps and indexes, so do
 * not do anything that assumes we are touching a heap.
 */
XLogRecPtr
log_newpage(RelFileNode *rnode, BlockNumber blkno, Page page)
{
	xl_heap_newpage xlrec;
	XLogRecData rdata[2];
	XLogRecData *fixed_seg = &rdata[0];
	XLogRecData *page_seg = &rdata[1];
	XLogRecPtr	lsn;

	/* NO ELOG(ERROR) from here till newpage op is logged */
	START_CRIT_SECTION();

	xlrec.node = *rnode;
	xlrec.blkno = blkno;

	fixed_seg->data = (char *) &xlrec;
	fixed_seg->len = SizeOfHeapNewpage;
	fixed_seg->buffer = InvalidBuffer;
	fixed_seg->next = page_seg;

	/* the page image itself */
	page_seg->data = (char *) page;
	page_seg->len = BLCKSZ;
	page_seg->buffer = InvalidBuffer;
	page_seg->next = NULL;

	lsn = XLogInsert(RM_HEAP_ID, XLOG_HEAP_NEWPAGE, rdata);

	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);

	END_CRIT_SECTION();

	return lsn;
}

3896 3897 3898
/*
 * Handles CLEAN and CLEAN_MOVE record types
 */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3899
static void
3900
heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3901
{
3902 3903 3904 3905
	xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;
3906 3907 3908 3909 3910
	OffsetNumber *offnum;
	OffsetNumber *end;
	int nredirected;
	int ndead;
	int i;
3911

3912
	if (record->xl_info & XLR_BKP_BLOCK_1)
3913 3914
		return;

3915
	reln = XLogOpenRelation(xlrec->node);
3916
	buffer = XLogReadBuffer(reln, xlrec->block, false);
3917
	if (!BufferIsValid(buffer))
3918
		return;
3919
	page = (Page) BufferGetPage(buffer);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3920

3921 3922
	if (XLByteLE(lsn, PageGetLSN(page)))
	{
3923
		UnlockReleaseBuffer(buffer);
3924 3925 3926
		return;
	}

3927 3928 3929 3930
	nredirected = xlrec->nredirected;
	ndead = xlrec->ndead;
	offnum = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
	end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3931

3932 3933 3934 3935 3936 3937
	/* Update all redirected or moved line pointers */
	for (i = 0; i < nredirected; i++)
	{
		OffsetNumber fromoff = *offnum++;
		OffsetNumber tooff = *offnum++;
		ItemId	fromlp = PageGetItemId(page, fromoff);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3938

3939
		if (clean_move)
3940
		{
3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957
			/* Physically move the "to" item to the "from" slot */
			ItemId	tolp = PageGetItemId(page, tooff);
			HeapTupleHeader htup;

			*fromlp = *tolp;
			ItemIdSetUnused(tolp);

			/* We also have to clear the tuple's heap-only bit */
			Assert(ItemIdIsNormal(fromlp));
			htup = (HeapTupleHeader) PageGetItem(page, fromlp);
			Assert(HeapTupleHeaderIsHeapOnly(htup));
			HeapTupleHeaderClearHeapOnly(htup);
		}
		else
		{
			/* Just insert a REDIRECT link at fromoff */
			ItemIdSetRedirect(fromlp, tooff);
3958
		}
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3959 3960
	}

3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983
	/* Update all now-dead line pointers */
	for (i = 0; i < ndead; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId	lp = PageGetItemId(page, off);

		ItemIdSetDead(lp);
	}

	/* Update all now-unused line pointers */
	while (offnum < end)
	{
		OffsetNumber off = *offnum++;
		ItemId	lp = PageGetItemId(page, off);

		ItemIdSetUnused(lp);
	}

	/*
	 * Finally, repair any fragmentation, and update the page's hint bit
	 * about whether it has free pointers.
	 */
	PageRepairFragmentation(page);
3984 3985

	PageSetLSN(page, lsn);
3986
	PageSetTLI(page, ThisTimeLineID);
3987 3988
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
3989 3990
}

3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039
/*
 * Replay a FREEZE record: call heap_freeze_tuple, with the record's
 * cutoff_xid, on every tuple whose offset is listed after the fixed
 * xl_heap_freeze part.
 *
 * Nothing is done if the record has XLR_BKP_BLOCK_1 set, if the block
 * cannot be read, or if the page LSN shows the record is already applied.
 */
static void
heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_freeze *xlrec = (xl_heap_freeze *) XLogRecGetData(record);
	TransactionId cutoff_xid = xlrec->cutoff_xid;
	Relation	reln;
	Buffer		buffer;
	Page		page;

	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;

	reln = XLogOpenRelation(xlrec->node);
	buffer = XLogReadBuffer(reln, xlrec->block, false);
	if (!BufferIsValid(buffer))
		return;
	page = (Page) BufferGetPage(buffer);

	if (XLByteLE(lsn, PageGetLSN(page)))
	{
		UnlockReleaseBuffer(buffer);
		return;
	}

	if (record->xl_len > SizeOfHeapFreeze)
	{
		OffsetNumber *cursor;
		OffsetNumber *limit;

		limit = (OffsetNumber *) ((char *) xlrec + record->xl_len);

		for (cursor = (OffsetNumber *) ((char *) xlrec + SizeOfHeapFreeze);
			 cursor < limit;
			 cursor++)
		{
			/* offsets[] entries are one-based */
			ItemId		lp = PageGetItemId(page, *cursor);
			HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp);

			(void) heap_freeze_tuple(tuple, cutoff_xid, InvalidBuffer);
		}
	}

	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}

4040
static void
4041
heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
4042 4043 4044 4045 4046 4047 4048
{
	xl_heap_newpage *xlrec = (xl_heap_newpage *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;

	/*
4049 4050
	 * Note: the NEWPAGE log record is used for both heaps and indexes, so do
	 * not do anything that assumes we are touching a heap.
4051
	 */
4052
	reln = XLogOpenRelation(xlrec->node);
4053 4054
	buffer = XLogReadBuffer(reln, xlrec->blkno, true);
	Assert(BufferIsValid(buffer));
4055 4056 4057 4058 4059 4060
	page = (Page) BufferGetPage(buffer);

	Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
	memcpy(page, (char *) xlrec + SizeOfHeapNewpage, BLCKSZ);

	PageSetLSN(page, lsn);
4061
	PageSetTLI(page, ThisTimeLineID);
4062 4063
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
4064 4065
}

4066
static void
4067
heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
4068
{
4069
	xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
4070
	Relation	reln;
4071 4072 4073 4074 4075
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;
4076

4077
	if (record->xl_info & XLR_BKP_BLOCK_1)
4078 4079
		return;

4080
	reln = XLogOpenRelation(xlrec->target.node);
4081 4082 4083
	buffer = XLogReadBuffer(reln,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)),
							false);
4084
	if (!BufferIsValid(buffer))
4085
		return;
4086
	page = (Page) BufferGetPage(buffer);
4087

4088
	if (XLByteLE(lsn, PageGetLSN(page)))		/* changes are applied */
4089
	{
4090
		UnlockReleaseBuffer(buffer);
4091
		return;
4092 4093
	}

4094
	offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
4095 4096
	if (PageGetMaxOffsetNumber(page) >= offnum)
		lp = PageGetItemId(page, offnum);
4097

4098
	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
4099
		elog(PANIC, "heap_delete_redo: invalid lp");
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
4100

4101
	htup = (HeapTupleHeader) PageGetItem(page, lp);
4102

4103 4104 4105 4106 4107
	htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
						  HEAP_XMAX_INVALID |
						  HEAP_XMAX_IS_MULTI |
						  HEAP_IS_LOCKED |
						  HEAP_MOVED);
4108
	HeapTupleHeaderClearHotUpdated(htup);
4109
	HeapTupleHeaderSetXmax(htup, record->xl_xid);
4110
	HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
4111 4112

	/* Mark the page as a candidate for pruning */
4113
	PageSetPrunable(page, record->xl_xid);
4114

4115 4116 4117 4118
	/* Make sure there is no forward chain link in t_ctid */
	htup->t_ctid = xlrec->target.tid;
	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
4119 4120
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
4121 4122
}

4123
static void
4124
heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
4125
{
4126
	xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
4127
	Relation	reln;
4128 4129 4130
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
4131 4132 4133
	struct
	{
		HeapTupleHeaderData hdr;
4134
		char		data[MaxHeapTupleSize];
4135 4136 4137 4138
	}			tbuf;
	HeapTupleHeader htup;
	xl_heap_header xlhdr;
	uint32		newlen;
4139

4140
	if (record->xl_info & XLR_BKP_BLOCK_1)
4141 4142
		return;

4143
	reln = XLogOpenRelation(xlrec->target.node);
4144

4145
	if (record->xl_info & XLOG_HEAP_INIT_PAGE)
4146 4147
	{
		buffer = XLogReadBuffer(reln,
Bruce Momjian's avatar
Bruce Momjian committed
4148 4149
							 ItemPointerGetBlockNumber(&(xlrec->target.tid)),
								true);
4150 4151
		Assert(BufferIsValid(buffer));
		page = (Page) BufferGetPage(buffer);
4152

4153 4154 4155
		PageInit(page, BufferGetPageSize(buffer), 0);
	}
	else
4156
	{
4157
		buffer = XLogReadBuffer(reln,
Bruce Momjian's avatar
Bruce Momjian committed
4158 4159
							 ItemPointerGetBlockNumber(&(xlrec->target.tid)),
								false);
4160 4161 4162 4163
		if (!BufferIsValid(buffer))
			return;
		page = (Page) BufferGetPage(buffer);

Bruce Momjian's avatar
Bruce Momjian committed
4164
		if (XLByteLE(lsn, PageGetLSN(page)))	/* changes are applied */
4165
		{
4166
			UnlockReleaseBuffer(buffer);
4167 4168
			return;
		}
4169 4170
	}

4171 4172 4173 4174 4175
	offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
	if (PageGetMaxOffsetNumber(page) + 1 < offnum)
		elog(PANIC, "heap_insert_redo: invalid max offset number");

	newlen = record->xl_len - SizeOfHeapInsert - SizeOfHeapHeader;
4176
	Assert(newlen <= MaxHeapTupleSize);
4177 4178 4179 4180 4181 4182 4183 4184 4185 4186
	memcpy((char *) &xlhdr,
		   (char *) xlrec + SizeOfHeapInsert,
		   SizeOfHeapHeader);
	htup = &tbuf.hdr;
	MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
	/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
	memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
		   (char *) xlrec + SizeOfHeapInsert + SizeOfHeapHeader,
		   newlen);
	newlen += offsetof(HeapTupleHeaderData, t_bits);
4187
	htup->t_infomask2 = xlhdr.t_infomask2;
4188 4189 4190 4191 4192 4193
	htup->t_infomask = xlhdr.t_infomask;
	htup->t_hoff = xlhdr.t_hoff;
	HeapTupleHeaderSetXmin(htup, record->xl_xid);
	HeapTupleHeaderSetCmin(htup, FirstCommandId);
	htup->t_ctid = xlrec->target.tid;

4194
	offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
4195 4196 4197 4198
	if (offnum == InvalidOffsetNumber)
		elog(PANIC, "heap_insert_redo: failed to add tuple");
	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
4199 4200
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
4201 4202
}

Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
4203
/*
4204
 * Handles UPDATE, HOT_UPDATE & MOVE
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
4205
 */
4206
static void
4207
heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update)
4208
{
4209
	xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
4210
	Relation	reln = XLogOpenRelation(xlrec->target.node);
4211
	Buffer		buffer;
4212 4213
	bool		samepage = (ItemPointerGetBlockNumber(&(xlrec->newtid)) ==
							ItemPointerGetBlockNumber(&(xlrec->target.tid)));
4214 4215 4216 4217
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;
4218 4219 4220
	struct
	{
		HeapTupleHeaderData hdr;
4221
		char		data[MaxHeapTupleSize];
4222 4223 4224 4225
	}			tbuf;
	xl_heap_header xlhdr;
	int			hsize;
	uint32		newlen;
4226

4227
	if (record->xl_info & XLR_BKP_BLOCK_1)
4228 4229 4230
	{
		if (samepage)
			return;				/* backup block covered both changes */
4231
		goto newt;
4232
	}
4233

4234 4235
	/* Deal with old tuple version */

4236 4237 4238
	buffer = XLogReadBuffer(reln,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)),
							false);
4239
	if (!BufferIsValid(buffer))
4240
		goto newt;
4241 4242
	page = (Page) BufferGetPage(buffer);

4243
	if (XLByteLE(lsn, PageGetLSN(page)))		/* changes are applied */
4244
	{
4245
		UnlockReleaseBuffer(buffer);
4246 4247 4248
		if (samepage)
			return;
		goto newt;
4249 4250 4251
	}

	offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
4252 4253
	if (PageGetMaxOffsetNumber(page) >= offnum)
		lp = PageGetItemId(page, offnum);
4254

4255
	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
4256
		elog(PANIC, "heap_update_redo: invalid lp");
4257

4258 4259
	htup = (HeapTupleHeader) PageGetItem(page, lp);

4260
	if (move)
4261
	{
4262 4263 4264 4265
		htup->t_infomask &= ~(HEAP_XMIN_COMMITTED |
							  HEAP_XMIN_INVALID |
							  HEAP_MOVED_IN);
		htup->t_infomask |= HEAP_MOVED_OFF;
4266
		HeapTupleHeaderClearHotUpdated(htup);
4267 4268 4269
		HeapTupleHeaderSetXvac(htup, record->xl_xid);
		/* Make sure there is no forward chain link in t_ctid */
		htup->t_ctid = xlrec->target.tid;
4270
	}
4271 4272 4273 4274 4275 4276 4277
	else
	{
		htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
							  HEAP_XMAX_INVALID |
							  HEAP_XMAX_IS_MULTI |
							  HEAP_IS_LOCKED |
							  HEAP_MOVED);
4278 4279 4280 4281
		if (hot_update)
			HeapTupleHeaderSetHotUpdated(htup);
		else
			HeapTupleHeaderClearHotUpdated(htup);
4282
		HeapTupleHeaderSetXmax(htup, record->xl_xid);
4283
		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
4284 4285 4286
		/* Set forward chain link in t_ctid */
		htup->t_ctid = xlrec->newtid;
	}
Bruce Momjian's avatar
Bruce Momjian committed
4287

4288
	/* Mark the page as a candidate for pruning */
4289
	PageSetPrunable(page, record->xl_xid);
4290

4291 4292 4293 4294
	/*
	 * this test is ugly, but necessary to avoid thinking that insert change
	 * is already applied
	 */
4295 4296 4297 4298
	if (samepage)
		goto newsame;
	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
4299 4300
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
4301 4302 4303 4304 4305

	/* Deal with new tuple */

newt:;

4306
	if (record->xl_info & XLR_BKP_BLOCK_2)
4307 4308
		return;

4309
	if (record->xl_info & XLOG_HEAP_INIT_PAGE)
4310 4311 4312 4313 4314 4315
	{
		buffer = XLogReadBuffer(reln,
								ItemPointerGetBlockNumber(&(xlrec->newtid)),
								true);
		Assert(BufferIsValid(buffer));
		page = (Page) BufferGetPage(buffer);
4316

4317 4318 4319
		PageInit(page, BufferGetPageSize(buffer), 0);
	}
	else
4320
	{
4321 4322 4323 4324 4325 4326 4327
		buffer = XLogReadBuffer(reln,
								ItemPointerGetBlockNumber(&(xlrec->newtid)),
								false);
		if (!BufferIsValid(buffer))
			return;
		page = (Page) BufferGetPage(buffer);

Bruce Momjian's avatar
Bruce Momjian committed
4328
		if (XLByteLE(lsn, PageGetLSN(page)))	/* changes are applied */
4329
		{
4330
			UnlockReleaseBuffer(buffer);
4331 4332
			return;
		}
4333 4334
	}

4335 4336
newsame:;

4337 4338 4339 4340 4341 4342 4343 4344 4345
	offnum = ItemPointerGetOffsetNumber(&(xlrec->newtid));
	if (PageGetMaxOffsetNumber(page) + 1 < offnum)
		elog(PANIC, "heap_update_redo: invalid max offset number");

	hsize = SizeOfHeapUpdate + SizeOfHeapHeader;
	if (move)
		hsize += (2 * sizeof(TransactionId));

	newlen = record->xl_len - hsize;
4346
	Assert(newlen <= MaxHeapTupleSize);
4347 4348 4349 4350 4351 4352 4353 4354 4355 4356
	memcpy((char *) &xlhdr,
		   (char *) xlrec + SizeOfHeapUpdate,
		   SizeOfHeapHeader);
	htup = &tbuf.hdr;
	MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
	/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
	memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
		   (char *) xlrec + hsize,
		   newlen);
	newlen += offsetof(HeapTupleHeaderData, t_bits);
4357
	htup->t_infomask2 = xlhdr.t_infomask2;
4358 4359 4360 4361 4362
	htup->t_infomask = xlhdr.t_infomask;
	htup->t_hoff = xlhdr.t_hoff;

	if (move)
	{
4363
		TransactionId xid[2];	/* xmax, xmin */
4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378

		memcpy((char *) xid,
			   (char *) xlrec + SizeOfHeapUpdate + SizeOfHeapHeader,
			   2 * sizeof(TransactionId));
		HeapTupleHeaderSetXmin(htup, xid[1]);
		HeapTupleHeaderSetXmax(htup, xid[0]);
		HeapTupleHeaderSetXvac(htup, record->xl_xid);
	}
	else
	{
		HeapTupleHeaderSetXmin(htup, record->xl_xid);
		HeapTupleHeaderSetCmin(htup, FirstCommandId);
	}
	/* Make sure there is no forward chain link in t_ctid */
	htup->t_ctid = xlrec->newtid;
4379

4380
	offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
4381 4382 4383 4384
	if (offnum == InvalidOffsetNumber)
		elog(PANIC, "heap_update_redo: failed to add tuple");
	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
4385 4386
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
4387 4388
}

4389
static void
4390
heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
4391 4392 4393 4394 4395 4396 4397 4398 4399
{
	xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;

4400
	if (record->xl_info & XLR_BKP_BLOCK_1)
4401 4402
		return;

4403
	reln = XLogOpenRelation(xlrec->target.node);
4404 4405 4406
	buffer = XLogReadBuffer(reln,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)),
							false);
4407
	if (!BufferIsValid(buffer))
4408
		return;
4409 4410
	page = (Page) BufferGetPage(buffer);

4411
	if (XLByteLE(lsn, PageGetLSN(page)))		/* changes are applied */
4412
	{
4413
		UnlockReleaseBuffer(buffer);
4414
		return;
4415 4416 4417 4418 4419 4420
	}

	offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
	if (PageGetMaxOffsetNumber(page) >= offnum)
		lp = PageGetItemId(page, offnum);

4421
	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
4422
		elog(PANIC, "heap_lock_redo: invalid lp");
4423 4424 4425

	htup = (HeapTupleHeader) PageGetItem(page, lp);

4426 4427 4428 4429 4430
	htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
						  HEAP_XMAX_INVALID |
						  HEAP_XMAX_IS_MULTI |
						  HEAP_IS_LOCKED |
						  HEAP_MOVED);
4431 4432 4433 4434 4435 4436
	if (xlrec->xid_is_mxact)
		htup->t_infomask |= HEAP_XMAX_IS_MULTI;
	if (xlrec->shared_lock)
		htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
	else
		htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
4437
	HeapTupleHeaderClearHotUpdated(htup);
4438
	HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
4439
	HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
4440 4441 4442 4443
	/* Make sure there is no forward chain link in t_ctid */
	htup->t_ctid = xlrec->target.tid;
	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
4444 4445
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
4446 4447
}

4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480
/*
 * Replay an INPLACE record: overwrite the data area of the target tuple
 * (everything after t_hoff) with the bytes that follow the xl_heap_inplace
 * part of the record.
 *
 * Nothing is done if the record has XLR_BKP_BLOCK_1 set, if the block
 * cannot be read, or if the page LSN shows the record is already applied.
 * PANICs if the target line pointer is invalid or if the stored data
 * length differs from the existing tuple's data length.
 */
static void
heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
	Relation	reln = XLogOpenRelation(xlrec->target.node);
	ItemPointer target_tid = &(xlrec->target.tid);
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	ItemId		lp;
	HeapTupleHeader htup;
	uint32		datalen_on_page;
	uint32		datalen_in_rec;

	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;

	buffer = XLogReadBuffer(reln,
							ItemPointerGetBlockNumber(target_tid),
							false);
	if (!BufferIsValid(buffer))
		return;
	page = (Page) BufferGetPage(buffer);

	if (XLByteLE(lsn, PageGetLSN(page)))		/* changes are applied */
	{
		UnlockReleaseBuffer(buffer);
		return;
	}

	/* Look up the line pointer only when the offset is within the page */
	offnum = ItemPointerGetOffsetNumber(target_tid);
	lp = (offnum <= PageGetMaxOffsetNumber(page)) ?
		PageGetItemId(page, offnum) : NULL;

	if (lp == NULL || !ItemIdIsNormal(lp))
		elog(PANIC, "heap_inplace_redo: invalid lp");

	htup = (HeapTupleHeader) PageGetItem(page, lp);

	/* The replacement data must be exactly as long as what it replaces */
	datalen_on_page = ItemIdGetLength(lp) - htup->t_hoff;
	datalen_in_rec = record->xl_len - SizeOfHeapInplace;
	if (datalen_on_page != datalen_in_rec)
		elog(PANIC, "heap_inplace_redo: wrong tuple length");

	memcpy((char *) htup + htup->t_hoff,
		   (char *) xlrec + SizeOfHeapInplace,
		   datalen_in_rec);

	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}

4501 4502
/*
 * Redo dispatcher for RM_HEAP_ID records: picks the replay routine from
 * the opcode bits (XLOG_HEAP_OPMASK) of the record's info byte.  PANICs on
 * an opcode it does not know.
 */
void
heap_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;
	uint8		opcode = info & XLOG_HEAP_OPMASK;

	switch (opcode)
	{
		case XLOG_HEAP_INSERT:
			heap_xlog_insert(lsn, record);
			break;
		case XLOG_HEAP_DELETE:
			heap_xlog_delete(lsn, record);
			break;
		case XLOG_HEAP_UPDATE:
		case XLOG_HEAP_MOVE:
		case XLOG_HEAP_HOT_UPDATE:
			/* one routine replays all three; the flags say which flavor */
			heap_xlog_update(lsn, record,
							 opcode == XLOG_HEAP_MOVE,
							 opcode == XLOG_HEAP_HOT_UPDATE);
			break;
		case XLOG_HEAP_NEWPAGE:
			heap_xlog_newpage(lsn, record);
			break;
		case XLOG_HEAP_LOCK:
			heap_xlog_lock(lsn, record);
			break;
		case XLOG_HEAP_INPLACE:
			heap_xlog_inplace(lsn, record);
			break;
		default:
			elog(PANIC, "heap_redo: unknown op code %u", info);
	}
}

4537 4538 4539 4540 4541
/*
 * Redo dispatcher for RM_HEAP2_ID records (FREEZE, CLEAN, CLEAN_MOVE),
 * selected by the opcode bits of the record's info byte.  PANICs on an
 * opcode it does not know.
 */
void
heap2_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;
	uint8		opcode = info & XLOG_HEAP_OPMASK;

	if (opcode == XLOG_HEAP2_FREEZE)
		heap_xlog_freeze(lsn, record);
	else if (opcode == XLOG_HEAP2_CLEAN)
		heap_xlog_clean(lsn, record, false);
	else if (opcode == XLOG_HEAP2_CLEAN_MOVE)
		heap_xlog_clean(lsn, record, true);
	else
		elog(PANIC, "heap2_redo: unknown op code %u", info);
}

Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4558
static void
4559
out_target(StringInfo buf, xl_heaptid *target)
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4560
{
4561
	appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u",
Bruce Momjian's avatar
Bruce Momjian committed
4562 4563 4564
			 target->node.spcNode, target->node.dbNode, target->node.relNode,
					 ItemPointerGetBlockNumber(&(target->tid)),
					 ItemPointerGetOffsetNumber(&(target->tid)));
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4565
}
void
4568
heap_desc(StringInfo buf, uint8 xl_info, char *rec)
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4569
{
4570
	uint8		info = xl_info & ~XLR_INFO_MASK;
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4571

4572
	info &= XLOG_HEAP_OPMASK;
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4573 4574
	if (info == XLOG_HEAP_INSERT)
	{
4575 4576
		xl_heap_insert *xlrec = (xl_heap_insert *) rec;

4577 4578 4579 4580
		if (xl_info & XLOG_HEAP_INIT_PAGE)
			appendStringInfo(buf, "insert(init): ");
		else
			appendStringInfo(buf, "insert: ");
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4581 4582 4583 4584
		out_target(buf, &(xlrec->target));
	}
	else if (info == XLOG_HEAP_DELETE)
	{
4585 4586
		xl_heap_delete *xlrec = (xl_heap_delete *) rec;

4587
		appendStringInfo(buf, "delete: ");
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4588 4589
		out_target(buf, &(xlrec->target));
	}
4590
	else if (info == XLOG_HEAP_UPDATE)
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4591
	{
4592 4593
		xl_heap_update *xlrec = (xl_heap_update *) rec;

4594 4595 4596
		if (xl_info & XLOG_HEAP_INIT_PAGE)
			appendStringInfo(buf, "update(init): ");
		else
4597
			appendStringInfo(buf, "update: ");
4598 4599
		out_target(buf, &(xlrec->target));
		appendStringInfo(buf, "; new %u/%u",
Bruce Momjian's avatar
Bruce Momjian committed
4600 4601
						 ItemPointerGetBlockNumber(&(xlrec->newtid)),
						 ItemPointerGetOffsetNumber(&(xlrec->newtid)));
4602 4603 4604 4605 4606 4607 4608
	}
	else if (info == XLOG_HEAP_MOVE)
	{
		xl_heap_update *xlrec = (xl_heap_update *) rec;

		if (xl_info & XLOG_HEAP_INIT_PAGE)
			appendStringInfo(buf, "move(init): ");
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
4609
		else
4610
			appendStringInfo(buf, "move: ");
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4611
		out_target(buf, &(xlrec->target));
4612
		appendStringInfo(buf, "; new %u/%u",
Bruce Momjian's avatar
Bruce Momjian committed
4613 4614
						 ItemPointerGetBlockNumber(&(xlrec->newtid)),
						 ItemPointerGetOffsetNumber(&(xlrec->newtid)));
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4615
	}
4616
	else if (info == XLOG_HEAP_HOT_UPDATE)
4617
	{
4618
		xl_heap_update *xlrec = (xl_heap_update *) rec;
4619

4620 4621 4622 4623 4624 4625 4626 4627
		if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? */
			appendStringInfo(buf, "hot_update(init): ");
		else
			appendStringInfo(buf, "hot_update: ");
		out_target(buf, &(xlrec->target));
		appendStringInfo(buf, "; new %u/%u",
						 ItemPointerGetBlockNumber(&(xlrec->newtid)),
						 ItemPointerGetOffsetNumber(&(xlrec->newtid)));
4628
	}
4629 4630 4631 4632
	else if (info == XLOG_HEAP_NEWPAGE)
	{
		xl_heap_newpage *xlrec = (xl_heap_newpage *) rec;

4633
		appendStringInfo(buf, "newpage: rel %u/%u/%u; blk %u",
Bruce Momjian's avatar
Bruce Momjian committed
4634 4635
						 xlrec->node.spcNode, xlrec->node.dbNode,
						 xlrec->node.relNode, xlrec->blkno);
4636
	}
4637 4638 4639 4640 4641
	else if (info == XLOG_HEAP_LOCK)
	{
		xl_heap_lock *xlrec = (xl_heap_lock *) rec;

		if (xlrec->shared_lock)
4642
			appendStringInfo(buf, "shared_lock: ");
4643
		else
4644
			appendStringInfo(buf, "exclusive_lock: ");
4645
		if (xlrec->xid_is_mxact)
4646
			appendStringInfo(buf, "mxid ");
4647
		else
4648 4649
			appendStringInfo(buf, "xid ");
		appendStringInfo(buf, "%u ", xlrec->locking_xid);
4650 4651
		out_target(buf, &(xlrec->target));
	}
4652 4653 4654 4655 4656 4657 4658
	else if (info == XLOG_HEAP_INPLACE)
	{
		xl_heap_inplace *xlrec = (xl_heap_inplace *) rec;

		appendStringInfo(buf, "inplace: ");
		out_target(buf, &(xlrec->target));
	}
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4659
	else
4660
		appendStringInfo(buf, "UNKNOWN");
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
4661
}

/*
 * heap2_desc
 *		Append a human-readable description of a WAL record from the
 *		second heap opcode set to buf.
 *
 * xl_info is the record's info byte and rec points at the record data,
 * which is interpreted as the struct matching the opcode.  The opcode is
 * xl_info with the XLR_INFO_MASK bits cleared and then masked by
 * XLOG_HEAP_OPMASK.  Unrecognized opcodes print "UNKNOWN".
 */
void
heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
{
	uint8		opcode = (xl_info & ~XLR_INFO_MASK) & XLOG_HEAP_OPMASK;

	switch (opcode)
	{
		case XLOG_HEAP2_FREEZE:
			{
				xl_heap_freeze *frz = (xl_heap_freeze *) rec;

				appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff %u",
								 frz->node.spcNode,
								 frz->node.dbNode,
								 frz->node.relNode,
								 frz->block,
								 frz->cutoff_xid);
			}
			break;

		case XLOG_HEAP2_CLEAN:
			{
				xl_heap_clean *cln = (xl_heap_clean *) rec;

				appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
								 cln->node.spcNode,
								 cln->node.dbNode,
								 cln->node.relNode,
								 cln->block);
			}
			break;

		case XLOG_HEAP2_CLEAN_MOVE:
			{
				xl_heap_clean *cln = (xl_heap_clean *) rec;

				appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u",
								 cln->node.spcNode,
								 cln->node.dbNode,
								 cln->node.relNode,
								 cln->block);
			}
			break;

		default:
			appendStringInfo(buf, "UNKNOWN");
			break;
	}
}
/*
 *	heap_sync		- sync a heap, for use when no WAL has been written
 *
 * Forces the contents of the heap, and of its TOAST heap if it has one,
 * down to disk.  When WAL was skipped for a non-temp relation, the
 * relation must reach disk before the transaction can safely commit;
 * that means writing out any dirty buffers and then doing a forced fsync.
 *
 * Indexes are left alone.  (At present the index operations belonging to
 * the commands that use this routine are WAL-logged, so they need no
 * fsync.  That could change someday, but any fsync decision would
 * probably be made per index and so would not belong here anyway.)
 */
void
heap_sync(Relation rel)
{
	Relation	toastrel;

	/* a temp table never needs to be fsync'd */
	if (rel->rd_istemp)
		return;

	/*
	 * Main heap: write out dirty buffers, then fsync.  rd_smgr has been
	 * opened by FlushRelationBuffers by the time we use it.
	 */
	FlushRelationBuffers(rel);
	smgrimmedsync(rel->rd_smgr);

	/* nothing more to do unless the relation has a toast heap */
	if (!OidIsValid(rel->rd_rel->reltoastrelid))
		return;

	/* toast heap: same two steps, under AccessShareLock */
	toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
	FlushRelationBuffers(toastrel);
	smgrimmedsync(toastrel->rd_smgr);
	heap_close(toastrel, AccessShareLock);
}