/*-------------------------------------------------------------------------
 *
 * smgr.c
 *	  public interface routines to storage manager switch.
 *
 *	  All file system operations in POSTGRES dispatch through these
 *	  routines.
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.53 2001/09/29 04:02:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

20
#include "storage/bufmgr.h"
21
#include "storage/freespace.h"
22
#include "storage/ipc.h"
Marc G. Fournier's avatar
Marc G. Fournier committed
23
#include "storage/smgr.h"
24 25
#include "utils/memutils.h"

26

27
static void smgrshutdown(void);
28 29 30

typedef struct f_smgr
{
31 32
	int			(*smgr_init) (void);	/* may be NULL */
	int			(*smgr_shutdown) (void);		/* may be NULL */
33
	int			(*smgr_create) (Relation reln);
34
	int			(*smgr_unlink) (RelFileNode rnode);
35 36
	int			(*smgr_extend) (Relation reln, BlockNumber blocknum,
								char *buffer);
37 38 39
	int			(*smgr_open) (Relation reln);
	int			(*smgr_close) (Relation reln);
	int			(*smgr_read) (Relation reln, BlockNumber blocknum,
40
							  char *buffer);
41
	int			(*smgr_write) (Relation reln, BlockNumber blocknum,
42
							   char *buffer);
43
	int			(*smgr_flush) (Relation reln, BlockNumber blocknum,
44
							   char *buffer);
45
	int			(*smgr_blindwrt) (RelFileNode rnode, BlockNumber blkno,
46
								  char *buffer, bool dofsync);
47
	int			(*smgr_markdirty) (Relation reln, BlockNumber blkno);
48
	int			(*smgr_blindmarkdirty) (RelFileNode, BlockNumber blkno);
49 50
	BlockNumber	(*smgr_nblocks) (Relation reln);
	BlockNumber	(*smgr_truncate) (Relation reln, BlockNumber nblocks);
51 52
	int			(*smgr_commit) (void);	/* may be NULL */
	int			(*smgr_abort) (void);	/* may be NULL */
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
53
	int			(*smgr_sync) (void);
54
} f_smgr;
55 56

/*
57 58
 *	The weird placement of commas in this init block is to keep the compiler
 *	happy, regardless of what storage managers we have (or don't have).
59 60
 */

61
static f_smgr smgrsw[] = {
62

63 64
	/* magnetic disk */
	{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
65
		mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
66
		mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
67
	},
68

69
#ifdef STABLE_MEMORY_STORAGE
70 71
	/* main memory */
	{mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
72 73
		mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty,
	mmnblocks, NULL, mmcommit, mmabort},
74

75
#endif
76 77 78
};

/*
 *	Per-manager "write-once" flags, in the same order as the entries of
 *	smgrsw[].  An entry of 'true' marks a storage manager that is
 *	write-once; 'false' marks one that supports overwrite.  In the best of
 *	all possible worlds, there would be no write-once storage managers.
 *
 *	Compiled only when NOT_USED is defined.
 */

#ifdef NOT_USED
static bool smgrwo[] = {
	/* magnetic disk */
	false,
#ifdef STABLE_MEMORY_STORAGE
	/* main memory */
	false,
#endif
};

#endif

95
/* Number of entries in smgrsw[]; fixed at compile time, hence const. */
static const int NSmgr = lengthof(smgrsw);
96

97 98 99 100 101
/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
102
 * transaction is aborted.	Conversely, a deletion request is NOT
103 104 105 106 107 108 109 110 111 112 113
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
114 115 116
	int16		which;			/* which storage manager? */
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
	struct PendingRelDelete *next;		/* linked-list link */
117 118
} PendingRelDelete;

119
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
120 121


122
/*
123 124
 *	smgrinit(), smgrshutdown() -- Initialize or shut down all storage
 *								  managers.
125 126 127 128 129
 *
 */
int
smgrinit()
{
130
	int			i;
131 132 133 134 135 136

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_init)
		{
			if ((*(smgrsw[i].smgr_init)) () == SM_FAIL)
137
				elog(FATAL, "initialization failed on %s: %m",
138
					 DatumGetCString(DirectFunctionCall1(smgrout,
139
													 Int16GetDatum(i))));
140
		}
141 142
	}

143
	/* register the shutdown proc */
144
	on_proc_exit(smgrshutdown, 0);
145

146
	return SM_SUCCESS;
147 148
}

149
static void
150
smgrshutdown(void)
151
{
152
	int			i;
153 154 155 156 157 158

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_shutdown)
		{
			if ((*(smgrsw[i].smgr_shutdown)) () == SM_FAIL)
159
				elog(FATAL, "shutdown failed on %s: %m",
160
					 DatumGetCString(DirectFunctionCall1(smgrout,
161
													 Int16GetDatum(i))));
162
		}
163 164 165 166
	}
}

/*
 *	smgrcreate() -- Create a new relation.
 *
 *		Hands the reldesc to the smgr_create hook of storage manager
 *		'which'.  A negative result from the hook is reported with
 *		elog(ERROR).  On success the relation is also entered in the
 *		pending-delete list with atCommit = false, so that it is removed
 *		again if the transaction aborts.
 *
 *		Returns the file descriptor handed back by the hook.
 */
int
smgrcreate(int16 which, Relation reln)
{
	PendingRelDelete *entry;
	int			result;

	result = smgrsw[which].smgr_create(reln);
	if (result < 0)
		elog(ERROR, "cannot create %s: %m", RelationGetRelationName(reln));

	/* Remember the relation so it can be deleted if the xact aborts */
	entry = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
	entry->next = pendingDeletes;
	entry->atCommit = false;	/* delete if abort */
	entry->which = which;
	entry->relnode = reln->rd_node;
	pendingDeletes = entry;

	return result;
}

/*
 *	smgrunlink() -- Schedule a relation for removal.
 *
 *		Nothing is removed from the store here.  The relation is closed
 *		if it is open, and a pending-delete entry with atCommit = true
 *		is queued; the physical removal is left for transaction commit.
 *
 *		Returns SM_SUCCESS.
 */
int
smgrunlink(int16 which, Relation reln)
{
	PendingRelDelete *entry;

	/* The file must not stay open once it is scheduled for deletion */
	if (reln->rd_fd >= 0)
		smgrclose(which, reln);

	/* Queue the relation for deletion at commit */
	entry = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
	entry->next = pendingDeletes;
	entry->atCommit = true;		/* delete if commit */
	entry->which = which;
	entry->relnode = reln->rd_node;
	pendingDeletes = entry;

	/*
	 * NOTE: a relation that was also created in this transaction now has
	 * two entries in the pending-delete list, one with atCommit true and
	 * one with atCommit false.  Whichever way the transaction ends, one
	 * of them matches and the file is physically deleted; the other is
	 * skipped by smgrDoPendingDeletes, so no error results.  Dropping the
	 * earlier entry and deleting the file right away would also work,
	 * but the simpler logic is kept for now.
	 */

	return SM_SUCCESS;
}

/*
231
 *	smgrextend() -- Add a new block to a file.
232
 *
233 234 235 236 237
 *		The semantics are basically the same as smgrwrite(): write at the
 *		specified position.  However, we are expecting to extend the
 *		relation (ie, blocknum is the current EOF), and so in case of
 *		failure we clean up by truncating.
 *
238 239
 *		Returns SM_SUCCESS on success; aborts the current transaction on
 *		failure.
240 241
 */
int
242
smgrextend(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
243
{
244
	int			status;
245

246
	status = (*(smgrsw[which].smgr_extend)) (reln, blocknum, buffer);
247

248
	if (status == SM_FAIL)
249
		elog(ERROR, "cannot extend %s: %m.\n\tCheck free disk space.",
250
			 RelationGetRelationName(reln));
251

252
	return status;
253 254 255
}

/*
256
 *	smgropen() -- Open a relation using a particular storage manager.
257
 *
258 259 260
 *		Returns the fd for the open relation on success.
 *
 *		On failure, returns -1 if failOK, else aborts the transaction.
261 262
 */
int
263
smgropen(int16 which, Relation reln, bool failOK)
264
{
265
	int			fd;
266

Hiroshi Inoue's avatar
Hiroshi Inoue committed
267 268
	if (reln->rd_rel->relkind == RELKIND_VIEW)
		return -1;
269
	if ((fd = (*(smgrsw[which].smgr_open)) (reln)) < 0)
270
		if (!failOK)
271
			elog(ERROR, "cannot open %s: %m", RelationGetRelationName(reln));
272

273
	return fd;
274 275 276
}

/*
 *	smgrclose() -- Close a relation.
 *
 *		Calls the smgr_close hook of storage manager 'which'.
 *
 *		Returns SM_SUCCESS on success; reports elog(ERROR) when the hook
 *		returns SM_FAIL.
 */
int
smgrclose(int16 which, Relation reln)
{
	int			rc = smgrsw[which].smgr_close(reln);

	if (rc == SM_FAIL)
		elog(ERROR, "cannot close %s: %m", RelationGetRelationName(reln));

	return SM_SUCCESS;
}

/*
 *	smgrread() -- read a particular block from a relation into the supplied
 *				  buffer.
 *
 *		This routine is called from the buffer manager in order to
 *		instantiate pages in the shared buffer cache.  All storage managers
 *		return pages in the format that POSTGRES expects.  This routine
 *		dispatches the read to the smgr_read hook of storage manager
 *		'which'.
 *
 *		On success, it returns SM_SUCCESS.  On failure (the hook returns
 *		SM_FAIL), elog(ERROR) is raised and the current transaction is
 *		aborted.
 */
int
smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;

	status = (*(smgrsw[which].smgr_read)) (reln, blocknum, buffer);

	/* BlockNumber is unsigned, so it is printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot read block %u of %s: %m",
			 blocknum, RelationGetRelationName(reln));

	return status;
}

/*
 *	smgrwrite() -- Write the supplied buffer out.
 *
 *		This is not a synchronous write -- the interface for that is
 *		smgrflush().  The buffer is written out via the smgr_write hook
 *		of storage manager 'which'.
 *
 *		Returns SM_SUCCESS on success.  When the hook returns SM_FAIL,
 *		elog(ERROR) is raised and the current transaction is aborted.
 */
int
smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;

	status = (*(smgrsw[which].smgr_write)) (reln, blocknum, buffer);

	/* BlockNumber is unsigned, so it is printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot write block %u of %s: %m",
			 blocknum, RelationGetRelationName(reln));

	return status;
}

/*
 *	smgrflush() -- A synchronous smgrwrite().
 *
 *		Dispatches to the smgr_flush hook of storage manager 'which'.
 *
 *		Returns SM_SUCCESS on success.  When the hook returns SM_FAIL,
 *		elog(ERROR) is raised.
 */
int
smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;

	status = (*(smgrsw[which].smgr_flush)) (reln, blocknum, buffer);

	/* BlockNumber is unsigned, so it is printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot flush block %u of %s to stable store: %m",
			 blocknum, RelationGetRelationName(reln));

	return status;
}

/*
354
 *	smgrblindwrt() -- Write a page out blind.
355
 *
356 357 358 359 360 361 362
 *		In some cases, we may find a page in the buffer cache that we
 *		can't make a reldesc for.  This happens, for example, when we
 *		want to reuse a dirty page that was written by a transaction
 *		that has not yet committed, which created a new relation.  In
 *		this case, the buffer manager will call smgrblindwrt() with
 *		the name and OID of the database and the relation to which the
 *		buffer belongs.  Every storage manager must be able to force
363
 *		this page down to stable storage in this circumstance.	The
364
 *		write should be synchronous if dofsync is true.
365
 */
366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
int
smgrblindwrt(int16 which,
			 RelFileNode rnode,
			 BlockNumber blkno,
			 char *buffer,
			 bool dofsync)
{
	int			status;

	status = (*(smgrsw[which].smgr_blindwrt)) (rnode, blkno, buffer, dofsync);

	if (status == SM_FAIL)
		elog(ERROR, "cannot write block %d of %u/%u blind: %m",
			 blkno, rnode.tblNode, rnode.relNode);

	return status;
}

384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
/*
 *	smgrmarkdirty() -- Mark a page dirty (needs fsync).
 *
 *		Mark the specified page as needing to be fsync'd before commit.
 *		Ordinarily, the storage manager will do this implicitly during
 *		smgrwrite().  However, the buffer manager may discover that some
 *		other backend has written a buffer that we dirtied in the current
 *		transaction.  In that case, we still need to fsync the file to be
 *		sure the page is down to disk before we commit.
 *
 *		Returns SM_SUCCESS on success.  When the smgr_markdirty hook
 *		returns SM_FAIL, elog(ERROR) is raised.
 */
int
smgrmarkdirty(int16 which,
			  Relation reln,
			  BlockNumber blkno)
{
	int			status;

	status = (*(smgrsw[which].smgr_markdirty)) (reln, blkno);

	/* BlockNumber is unsigned, so it is printed with %u, not %d */
	if (status == SM_FAIL)
		elog(ERROR, "cannot mark block %u of %s: %m",
			 blkno, RelationGetRelationName(reln));

	return status;
}

/*
 *	smgrblindmarkdirty() -- Mark a page dirty, "blind".
 *
 *		Just like smgrmarkdirty, except we don't have a reldesc.
 */
415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430
int
smgrblindmarkdirty(int16 which,
				   RelFileNode rnode,
				   BlockNumber blkno)
{
	int			status;

	status = (*(smgrsw[which].smgr_blindmarkdirty)) (rnode, blkno);

	if (status == SM_FAIL)
		elog(ERROR, "cannot mark block %d of %u/%u blind: %m",
			 blkno, rnode.tblNode, rnode.relNode);

	return status;
}

431
/*
432 433
 *	smgrnblocks() -- Calculate the number of POSTGRES blocks in the
 *					 supplied relation.
434
 *
435 436
 *		Returns the number of blocks on success, aborts the current
 *		transaction on failure.
437
 */
438
BlockNumber
439 440
smgrnblocks(int16 which, Relation reln)
{
441 442 443 444 445 446 447 448 449 450 451 452 453 454
	BlockNumber		nblocks;

	nblocks = (*(smgrsw[which].smgr_nblocks)) (reln);
	/*
	 * NOTE: if a relation ever did grow to 2^32-1 blocks, this code would
	 * fail --- but that's a good thing, because it would stop us from
	 * extending the rel another block and having a block whose number
	 * actually is InvalidBlockNumber.
	 */
	if (nblocks == InvalidBlockNumber)
		elog(ERROR, "cannot count blocks for %s: %m",
			 RelationGetRelationName(reln));

	return nblocks;
455 456
}

457
/*
458 459
 *	smgrtruncate() -- Truncate supplied relation to a specified number
 *						of blocks
460
 *
461 462
 *		Returns the number of blocks on success, aborts the current
 *		transaction on failure.
463
 */
464 465
BlockNumber
smgrtruncate(int16 which, Relation reln, BlockNumber nblocks)
466
{
467
	BlockNumber		newblks;
468 469 470 471

	newblks = nblocks;
	if (smgrsw[which].smgr_truncate)
	{
472
		/*
473 474 475
		 * Tell the free space map to forget anything it may have stored
		 * for the about-to-be-deleted blocks.  We want to be sure it won't
		 * return bogus block numbers later on.
476
		 */
477 478 479
		MultiRecordFreeSpace(&reln->rd_node,
							 nblocks, MaxBlockNumber,
							 0, NULL, NULL);
480 481 482 483

		newblks = (*(smgrsw[which].smgr_truncate)) (reln, nblocks);
		if (newblks == InvalidBlockNumber)
			elog(ERROR, "cannot truncate %s to %u blocks: %m",
484
				 RelationGetRelationName(reln), nblocks);
485 486
	}

487
	return newblks;
488 489
}

490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
/*
 * smgrDoPendingDeletes() -- take care of relation deletes at end of xact.
 *
 *		Empties the pending-delete list.  Entries whose atCommit flag
 *		equals isCommit have their buffers dropped, their free space map
 *		data forgotten and their physical files unlinked; all other
 *		entries are simply discarded.  Every entry is pfree'd.
 *
 *		Returns SM_SUCCESS.
 */
int
smgrDoPendingDeletes(bool isCommit)
{
	PendingRelDelete *entry;

	while ((entry = pendingDeletes) != NULL)
	{
		/* detach the entry from the list before acting on it */
		pendingDeletes = entry->next;

		if (entry->atCommit == isCommit)
		{
			/*
			 * Discard any buffers still held for the rel.  None are
			 * expected in the commit case, but the abort case can leave
			 * some behind.
			 */
			DropRelFileNodeBuffers(entry->relnode);

			/*
			 * Have the free space map drop the relation.  Nobody will
			 * look at it again, but this lets the map space be recycled
			 * sooner.
			 */
			FreeSpaceMapForgetRel(&entry->relnode);

			/*
			 * Finally remove the physical files.
			 *
			 * Note: a failed deletion is only a NOTICE, not an error,
			 * since the decision to commit or abort the current xact
			 * has already been made.
			 */
			if (smgrsw[entry->which].smgr_unlink(entry->relnode) == SM_FAIL)
				elog(NOTICE, "cannot unlink %u/%u: %m",
					 entry->relnode.tblNode, entry->relnode.relNode);
		}

		pfree(entry);
	}

	return SM_SUCCESS;
}

534
/*
535 536
 *	smgrcommit(), smgrabort() -- Commit or abort changes made during the
 *								 current transaction.
537 538 539 540
 */
int
smgrcommit()
{
541
	int			i;
542 543 544 545 546 547

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_commit)
		{
			if ((*(smgrsw[i].smgr_commit)) () == SM_FAIL)
548
				elog(FATAL, "transaction commit failed on %s: %m",
549
					 DatumGetCString(DirectFunctionCall1(smgrout,
550
													 Int16GetDatum(i))));
551
		}
552 553
	}

554
	return SM_SUCCESS;
555 556 557 558 559
}

int
smgrabort()
{
560
	int			i;
561 562 563 564 565 566

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_abort)
		{
			if ((*(smgrsw[i].smgr_abort)) () == SM_FAIL)
567
				elog(FATAL, "transaction abort failed on %s: %m",
568
					 DatumGetCString(DirectFunctionCall1(smgrout,
569
													 Int16GetDatum(i))));
570
		}
571 572
	}

573
	return SM_SUCCESS;
574
}
575

Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
576 577 578 579 580 581 582 583 584 585 586 587
int
smgrsync()
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_sync)
		{
			if ((*(smgrsw[i].smgr_sync)) () == SM_FAIL)
				elog(STOP, "storage sync failed on %s: %m",
					 DatumGetCString(DirectFunctionCall1(smgrout,
588
													 Int16GetDatum(i))));
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
589 590 591 592 593 594
		}
	}

	return SM_SUCCESS;
}

595
#ifdef NOT_USED
/*
 *	smgriswo() -- Look up the write-once flag of a storage manager.
 *
 *		Reports elog(ERROR) when smgrno is not a valid index into the
 *		switch table; otherwise returns the smgrwo[] entry for it.
 *		Compiled only when NOT_USED is defined.
 */
bool
smgriswo(int16 smgrno)
{
	bool		in_range = (smgrno >= 0 && smgrno < NSmgr);

	if (!in_range)
		elog(ERROR, "illegal storage manager number %d", smgrno);

	return smgrwo[smgrno];
}

#endif
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
606 607 608 609 610 611 612 613 614 615

/*
 * smgr_redo() -- empty stub; lsn and record are ignored.
 *
 * NOTE(review): presumably the WAL redo callback for storage manager
 * records -- confirm against wherever this function is registered.
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
}

/*
 * smgr_undo() -- empty stub; lsn and record are ignored.
 *
 * NOTE(review): presumably the WAL undo callback for storage manager
 * records -- confirm against wherever this function is registered.
 */
void
smgr_undo(XLogRecPtr lsn, XLogRecord *record)
{
}
616

Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
617
void
618
smgr_desc(char *buf, uint8 xl_info, char *rec)
Vadim B. Mikheev's avatar
WAL  
Vadim B. Mikheev committed
619 620
{
}