/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *	  BTree-specific page management code for the Postgres btree access
 *	  method.
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.49 2001/01/29 07:28:17 vadim Exp $
 *
 *	NOTES
 *	   Postgres btree pages look like ordinary relation pages.	The opaque
 *	   data at high addresses includes pointers to left and right siblings
 *	   and flag data describing page state.  The first page in a btree, page
 *	   zero, is special -- it stores meta-information describing the tree.
 *	   Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
23
#include "postgres.h"
24

25 26
#include <time.h>

Bruce Momjian's avatar
Bruce Momjian committed
27 28
#include "access/nbtree.h"
#include "miscadmin.h"
29 30
#include "storage/lmgr.h"

31 32
extern bool FixBTree;	/* comments in nbtree.c */
extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
Bruce Momjian's avatar
Bruce Momjian committed
33

34
/*
 *	We use high-concurrency locking on btrees.	There are two cases in
 *	which we don't do locking.  One is when we're building the btree.
 *	Since the creating transaction has not committed, no one can see
 *	the index, and there's no reason to share locks.  The second case
 *	is when we're just starting up the database system.  We use some
 *	special-purpose initialization code in the relation cache manager
 *	(see utils/cache/relcache.c) to allow us to do indexed scans on
 *	the system catalogs before we'd normally be able to.  This happens
 *	before the lock table is fully initialized, so we can't use it.
 *	Strictly speaking, this violates 2pl, but we don't do 2pl on the
 *	system catalogs anyway, so I declare this to be okay.
 */

48
#define USELOCKING		(!BuildingBtree && !IsInitProcessingMode())
49 50

/*
 *	_bt_metapinit() -- Initialize the metadata page of a btree.
 *
 *		Allocates block zero of an empty index and fills it with a
 *		BTMetaPageData carrying the btree magic number and version.
 *		No root page is created here (btm_root = P_NONE); _bt_getroot()
 *		builds the root lazily on first BT_WRITE access.
 *
 *		Errors out if the relation already contains any blocks.
 */
void
_bt_metapinit(Relation rel)
{
	Buffer		buf;
	Page		pg;
	int			nblocks;
	BTMetaPageData metad;
	BTPageOpaque op;

	/* can't be sharing this with anyone, now... */
	if (USELOCKING)
		LockRelation(rel, AccessExclusiveLock);

	/* only a completely empty relation may be initialized as a btree */
	if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0)
	{
		elog(ERROR, "Cannot initialize non-empty btree %s",
			 RelationGetRelationName(rel));
	}

	/* P_NEW extends the relation; this becomes block zero, the metapage */
	buf = ReadBuffer(rel, P_NEW);
	pg = BufferGetPage(buf);
	_bt_pageinit(pg, BufferGetPageSize(buf));

	/* fill in the metadata: no root yet, tree height zero */
	metad.btm_magic = BTREE_MAGIC;
	metad.btm_version = BTREE_VERSION;
	metad.btm_root = P_NONE;
	metad.btm_level = 0;
	memcpy((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));

	/* flag the page as the metapage in its special space */
	op = (BTPageOpaque) PageGetSpecialPointer(pg);
	op->btpo_flags = BTP_META;

	WriteBuffer(buf);

	/* all done */
	if (USELOCKING)
		UnlockRelation(rel, AccessExclusiveLock);
}

/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.  The
 *		standard class of race conditions exists here; I think I covered
 *		them all in the Hopi Indian rain dance of lock requests below.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.  For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
	Buffer		metabuf;
	Page		metapg;
	BTPageOpaque metaopaque;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	BTMetaPageData *metad;

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	metad = BTPageGetMeta(metapg);

	/* sanity-check the metapage before trusting anything in it */
	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		elog(ERROR, "Index %s is not a btree",
			 RelationGetRelationName(rel));

	if (metad->btm_version != BTREE_VERSION)
		elog(ERROR, "Version mismatch on %s: version %d file, version %d code",
			 RelationGetRelationName(rel),
			 metad->btm_version, BTREE_VERSION);

	/* if no root page initialized yet, do it */
	if (metad->btm_root == P_NONE)
	{
		/* If access = BT_READ, caller doesn't want us to create root yet */
		if (access == BT_READ)
		{
			_bt_relbuf(rel, metabuf, BT_READ);
			return InvalidBuffer;
		}

		/* trade in our read lock for a write lock */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(metabuf, BT_WRITE);

		/*
		 * Race condition:	if someone else initialized the metadata
		 * between the time we released the read lock and acquired the
		 * write lock, above, we must avoid doing it again.
		 */
		if (metad->btm_root == P_NONE)
		{

			/*
			 * Get, initialize, write, and leave a lock of the appropriate
			 * type on the new root page.  Since this is the first page in
			 * the tree, it's a leaf as well as the root.
			 */
			rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
			rootblkno = BufferGetBlockNumber(rootbuf);
			rootpage = BufferGetPage(rootbuf);

			/* NO ELOG(ERROR) till meta is updated */
			START_CRIT_SECTION();

			metad->btm_root = rootblkno;
			metad->btm_level = 1;

			_bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
			rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);

			/* XLOG stuff */
			{
				xl_btree_newroot	xlrec;
				XLogRecPtr			recptr;
				XLogRecData			rdata;

				xlrec.node = rel->rd_node;
				xlrec.level = 1;
				BlockIdSet(&(xlrec.rootblk), rootblkno);
				rdata.buffer = InvalidBuffer;
				rdata.data = (char*)&xlrec;
				rdata.len = SizeOfBtreeNewroot;
				rdata.next = NULL;

				recptr = XLogInsert(RM_BTREE_ID,
							XLOG_BTREE_NEWROOT|XLOG_BTREE_LEAF, &rdata);

				/* stamp both modified pages with the WAL record's position */
				PageSetLSN(rootpage, recptr);
				PageSetSUI(rootpage, ThisStartUpID);
				PageSetLSN(metapg, recptr);
				PageSetSUI(metapg, ThisStartUpID);
			}

			END_CRIT_SECTION();

			_bt_wrtnorelbuf(rel, rootbuf);

			/* swap write lock for read lock */
			LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
			LockBuffer(rootbuf, BT_READ);

			/* okay, metadata is correct, write and release it */
			_bt_wrtbuf(rel, metabuf);
		}
		else
		{
			/*
			 * Metadata initialized by someone else.  In order to
			 * guarantee no deadlocks, we have to release the metadata
			 * page and start all over again.
			 */
			_bt_relbuf(rel, metabuf, BT_WRITE);
			return _bt_getroot(rel, access);
		}
	}
	else
	{
		rootblkno = metad->btm_root;
		_bt_relbuf(rel, metabuf, BT_READ);		/* done with the meta page */

		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
	}

	/*
	 * Race condition:	If the root page split between the time we looked
	 * at the metadata page and got the root buffer, then we got the wrong
	 * buffer.  Release it and try again.
	 */
	rootpage = BufferGetPage(rootbuf);
	rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

	if (! P_ISROOT(rootopaque))
	{
		/*
		 * It happened, but if root page splitter failed to create
		 * new root page then we'll go in loop trying to call
		 * _bt_getroot again and again.
		 */
		if (FixBTree)
		{
			Buffer	newrootbuf;

check_parent:;
			if (rootopaque->btpo_parent == BTREE_METAPAGE)	/* unupdated! */
			{
				/* upgrade to a write lock so we may repair the tree */
				LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
				LockBuffer(rootbuf, BT_WRITE);

				/* handle concurrent fix of root page */
				if (rootopaque->btpo_parent == BTREE_METAPAGE)	/* unupdated! */
				{
					elog(NOTICE, "bt_getroot: fixing root page");
					newrootbuf = _bt_fixroot(rel, rootbuf, true);
					LockBuffer(newrootbuf, BUFFER_LOCK_UNLOCK);
					LockBuffer(newrootbuf, BT_READ);
					rootbuf = newrootbuf;
					rootpage = BufferGetPage(rootbuf);
					rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
					/* New root might be splitted while changing lock */
					if (P_ISROOT(rootopaque))
						return(rootbuf);
					/* rootbuf is read locked */
					goto check_parent;
				}
				else	/* someone else already fixed root */
				{
					LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
					LockBuffer(rootbuf, BT_READ);
				}
			}
			/*
			 * Ok, here we have old root page with btpo_parent pointing
			 * to upper level - check parent page because of there is
			 * good chance that parent is root page.
			 */
			newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ);
			_bt_relbuf(rel, rootbuf, BT_READ);
			rootbuf = newrootbuf;
			rootpage = BufferGetPage(rootbuf);
			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
			if (P_ISROOT(rootopaque))
				return(rootbuf);
			/* no luck -:( */
		}

		/* try again */
		_bt_relbuf(rel, rootbuf, BT_READ);
		return _bt_getroot(rel, access);
	}

	/*
	 * By here, we have a correct lock on the root block, its reference
	 * count is correct, and we have no lock set on the metadata page.
	 * Return the root block.
	 */
	return rootbuf;
}

/*
 *	_bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *		On return the requested buffer is pinned and has the requested
 *		lock mode set ("locked and pinned").  Passing blkno = P_NEW
 *		extends the relation by one page and returns that fresh page,
 *		already run through _bt_pageinit().
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
	Buffer		buf;

	if (blkno == P_NEW)
	{
		Page		newpg;

		/*
		 * Extend the relation by one page.
		 *
		 * The buffer manager's extension code is unclean and so we have
		 * to use extra locking here: serialize concurrent extenders via
		 * a lock on page zero of the relation.
		 */
		LockPage(rel, 0, ExclusiveLock);
		buf = ReadBuffer(rel, blkno);
		LockBuffer(buf, access);
		UnlockPage(rel, 0, ExclusiveLock);

		/* hand back a properly initialized empty page */
		newpg = BufferGetPage(buf);
		_bt_pageinit(newpg, BufferGetPageSize(buf));
	}
	else
	{
		/* plain fetch of an existing block */
		buf = ReadBuffer(rel, blkno);
		LockBuffer(buf, access);
	}

	/* ref count and lock type are correct */
	return buf;
}

/*
 *	_bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.  The "rel" and "access"
 * arguments are not used by the current implementation; "access"
 * records the lock mode the caller held.
 */
void
_bt_relbuf(Relation rel, Buffer buf, int access)
{
	/* release the lock first, then drop our pin */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);
}

/*
 *	_bt_wrtbuf() -- write a btree page to disk.
 *
 *		This routine releases the lock held on the buffer and our refcount
 *		for it.  It is an error to call _bt_wrtbuf() without a write lock
 *		and a pin on the buffer.
 *
 * NOTE: actually, the buffer manager just marks the shared buffer page
 * dirty here, the real I/O happens later.  Since we can't persuade the
 * Unix kernel to schedule disk writes in a particular order, there's not
 * much point in worrying about this.  The most we can say is that all the
 * writes will occur before commit.
 */
void
_bt_wrtbuf(Relation rel, Buffer buf)
{
	/* drop the buffer lock, then dirty the page and drop our pin */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	WriteBuffer(buf);
}

/*
 *	_bt_wrtnorelbuf() -- write a btree page to disk, but do not release
 *						 our reference or lock.
 *
 *		It is an error to call _bt_wrtnorelbuf() without a write lock
 *		and a pin on the buffer.
 *
 * See above NOTE.
 */
void
_bt_wrtnorelbuf(Relation rel, Buffer buf)
{
	/* just mark the page dirty; lock and pin stay with the caller */
	WriteNoReleaseBuffer(buf);
}

/*
 *	_bt_pageinit() -- Initialize a new page.
 *
 *		Zeroes the whole page, lays out the standard page header plus
 *		btree special space, and marks the page as having no known
 *		parent yet (btpo_parent = InvalidBlockNumber).
 */
void
_bt_pageinit(Page page, Size size)
{
	BTPageOpaque opaque;

	/*
	 * Cargo_cult programming -- don't really need this to be zero, but
	 * creating new pages is an infrequent occurrence and it makes me feel
	 * good when I know they're empty.
	 */
	MemSet(page, 0, size);

	PageInit(page, size, sizeof(BTPageOpaqueData));

	/* parent link is unknown until the page is hooked into the tree */
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_parent = InvalidBlockNumber;
}

/*
417
 *	_bt_metaproot() -- Change the root page of the btree.
418
 *
419 420 421 422
 *		Lehman and Yao require that the root page move around in order to
 *		guarantee deadlock-free short-term, fine-granularity locking.  When
 *		we split the root page, we record the new parent in the metadata page
 *		for the relation.  This routine does the work.
423
 *
424
 *		No direct preconditions, but if you don't have the write lock on
425 426
 *		at least the old root page when you call this, you're making a big
 *		mistake.  On exit, metapage data is correct and we no longer have
427
 *		a pin or lock on the metapage.
428 429
 */
void
430
_bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
431
{
432 433 434
	Buffer		metabuf;
	Page		metap;
	BTPageOpaque metaopaque;
435 436 437 438 439 440 441 442
	BTMetaPageData *metad;

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
	metap = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
	Assert(metaopaque->btpo_flags & BTP_META);
	metad = BTPageGetMeta(metap);
	metad->btm_root = rootbknum;
443
	if (level == 0)				/* called from _do_insert */
444 445 446 447
		metad->btm_level += 1;
	else
		metad->btm_level = level;		/* called from btsort */
	_bt_wrtbuf(rel, metabuf);
448 449 450
}

/*
 * Delete an item from a btree.  It had better be a leaf item...
 *
 *		Removes the index tuple addressed by "tid" from its page and
 *		emits an XLOG_BTREE_DELETE WAL record for the change, inside a
 *		critical section so the page change and its log record cannot
 *		be separated by an error.
 */
void
_bt_pagedel(Relation rel, ItemPointer tid)
{
	Buffer		buf;
	Page		page;
	BlockNumber blkno;
	OffsetNumber offno;

	blkno = ItemPointerGetBlockNumber(tid);
	offno = ItemPointerGetOffsetNumber(tid);

	buf = _bt_getbuf(rel, blkno, BT_WRITE);
	page = BufferGetPage(buf);

	/* no elog(ERROR) between the page change and the XLOG insert */
	START_CRIT_SECTION();
	PageIndexTupleDelete(page, offno);
	/* XLOG stuff */
	{
		xl_btree_delete	xlrec;
		XLogRecPtr		recptr;
		XLogRecData		rdata[2];

		xlrec.target.node = rel->rd_node;
		xlrec.target.tid = *tid;
		/* first chunk: the fixed-size record body, not tied to a buffer */
		rdata[0].buffer = InvalidBuffer;
		rdata[0].data = (char*)&xlrec;
		rdata[0].len = SizeOfBtreeDelete;
		rdata[0].next = &(rdata[1]);

		/*
		 * second chunk: empty, but carries the buffer reference --
		 * NOTE(review): presumably this lets XLogInsert attach a backup
		 * block image for the page; confirm against xlog.c.
		 */
		rdata[1].buffer = buf;
		rdata[1].data = NULL;
		rdata[1].len = 0;
		rdata[1].next = NULL;

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);

		/* stamp the modified page with the WAL record's position */
		PageSetLSN(page, recptr);
		PageSetSUI(page, ThisStartUpID);
	}
	END_CRIT_SECTION();

	/* write the buffer and release the lock */
	_bt_wrtbuf(rel, buf);
}