/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.66 2000/04/10 23:41:51 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "postgres.h"

#include "catalog/catalog.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "utils/inval.h"	/* ImmediateSharedRelationCacheInvalidate() */

#undef DIAGNOSTIC

/*
 *	The magnetic disk storage manager keeps track of open file descriptors
 *	in its own descriptor pool.  This happens for two reasons.  First, at
 *	transaction boundaries, we walk the list of descriptors and flush
 *	anything that we've dirtied in the current transaction.  Second, we want
 *	to support relations larger than the OS' file size limit (often 2GBytes).
 *	In order to do that, we break relations up into chunks of < 2GBytes
 *	and store one chunk in each of several files that represent the relation.
 *	See the BLCKSZ and RELSEG_SIZE configuration constants in include/config.h.
 *
 *	The file descriptor stored in the relation cache (see RelationGetFile())
 *	is actually an index into the Md_fdvec array.  -1 indicates not open.
 *
 *	When a relation is broken into multiple chunks, only the first chunk
 *	has its own entry in the Md_fdvec array; the remaining chunks have
 *	palloc'd MdfdVec objects that are chained onto the first chunk via the
 *	mdfd_chain links.  All chunks except the last MUST have size exactly
 *	equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
 */

/*
 * MdfdVec -- bookkeeping record for one open segment file.
 *
 * mdfd_nextFree links unused Md_fdvec array slots into a free list
 * (-1 terminates the list).  Unless LET_OS_MANAGE_FILESIZE is defined,
 * mdfd_chain links the records of the further segments of one relation.
 */
typedef struct _MdfdVec
{
	int			mdfd_vfd;		/* fd number in vfd pool */
	int			mdfd_flags;		/* fd status flags */

/* these are the assigned bits in mdfd_flags: */
#define MDFD_FREE		(1 << 0)/* unused entry */

	int			mdfd_lstbcnt;	/* most recent block count */
	int			mdfd_nextFree;	/* next free vector */
#ifndef LET_OS_MANAGE_FILESIZE
	struct _MdfdVec *mdfd_chain;/* for large relations */
#endif
} MdfdVec;

63
static int	Nfds = 100;			/* initial/current size of Md_fdvec array */
64
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
65 66 67
static int	Md_Free = -1;		/* head of freelist of unused fdvec entries */
static int	CurFd = 0;			/* first never-used fdvec index */
static MemoryContext MdCxt;		/* context for all my allocations */
68 69

/* routines declared here */
70
static void mdclose_fd(int fd);
71
static int _mdfd_getrelnfd(Relation reln);
72
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
73
static MdfdVec *_mdfd_getseg(Relation reln, int blkno);
74 75
static int _mdfd_blind_getseg(char *dbname, char *relname,
							  Oid dbid, Oid relid, int blkno);
76 77
static int	_fdvec_alloc(void);
static void _fdvec_free(int);
78 79 80
static BlockNumber _mdnblocks(File file, Size blcksz);

/*
81
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
82
 *
83 84 85 86 87
 *		We keep a private table of all file descriptors.  Whenever we do
 *		a write to one, we mark it dirty in our table.	Whenever we force
 *		changes to disk, we mark the file descriptor clean.  At transaction
 *		commit, we force changes to disk for all dirty file descriptors.
 *		This routine allocates and initializes the table.
88
 *
89
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
90 91 92 93
 */
int
mdinit()
{
94 95
	MemoryContext oldcxt;
	int			i;
96

97 98
	MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
	if (MdCxt == (MemoryContext) NULL)
99
		return SM_FAIL;
100

101 102 103
	oldcxt = MemoryContextSwitchTo(MdCxt);
	Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
	MemoryContextSwitchTo(oldcxt);
104

105
	if (Md_fdvec == (MdfdVec *) NULL)
106
		return SM_FAIL;
107

Bruce Momjian's avatar
Bruce Momjian committed
108
	MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
109

110 111 112 113 114 115 116 117
	/* Set free list */
	for (i = 0; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_Free = 0;
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
118

119
	return SM_SUCCESS;
120 121 122 123 124
}

/*
 *	mdcreate() -- Create the first segment file of a relation.
 *
 *		First tries an exclusive create.  If that fails, an already
 *		existing file may be adopted instead (see the comment in the body).
 *
 *		Returns the relation's index in the Md_fdvec table, or -1 on
 *		failure.  The path string is released on every exit path, and the
 *		opened file is closed again if no table entry can be obtained.
 */
int
mdcreate(Relation reln)
{
	int			fd,
				vfd;
	char	   *path;

	Assert(reln->rd_unlinked && reln->rd_fd < 0);
	path = relpath(RelationGetPhysicalRelationName(reln));
#ifndef __CYGWIN32__
	fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);
#else
	fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600);
#endif

	/*
	 * The exclusive create failed.  Outside bootstrap mode an uncataloged
	 * relation gives up immediately.  During bootstrap processing that
	 * check is skipped, because pg_time, pg_variable, and pg_log get
	 * created before their .bki file entries are processed.
	 *
	 * For cataloged relations, pg_class is guaranteed to have a unique
	 * record with the same relname by the unique index.  So we are able
	 * to reuse existing files for new cataloged relations.  Currently we
	 * reuse them in the following cases:
	 * 1. they are empty.
	 * 2. they are used for index relations and their size == BLCKSZ * 2.
	 */
	if (fd < 0)
	{
		if (!IsBootstrapProcessingMode() &&
			reln->rd_rel->relkind == RELKIND_UNCATALOGED)
		{
			pfree(path);
			return -1;
		}

#ifndef __CYGWIN32__
		fd = FileNameOpenFile(path, O_RDWR, 0600);
#else
		fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);
#endif
		if (fd < 0)
		{
			pfree(path);
			return -1;
		}
		if (!IsBootstrapProcessingMode())
		{
			bool		reuse = false;
			long		len = FileSeek(fd, 0L, SEEK_END);

			if (len == 0)
				reuse = true;
			else if (reln->rd_rel->relkind == RELKIND_INDEX &&
					 len == BLCKSZ * 2)
				reuse = true;
			if (!reuse)
			{
				FileClose(fd);
				pfree(path);
				return -1;
			}
		}
	}

	/* the file is open now; the path string is no longer needed */
	pfree(path);

	reln->rd_unlinked = false;

	vfd = _fdvec_alloc();
	if (vfd < 0)
	{
		FileClose(fd);
		return -1;
	}

	Md_fdvec[vfd].mdfd_vfd = fd;
	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
#ifndef LET_OS_MANAGE_FILESIZE
	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
#endif
	Md_fdvec[vfd].mdfd_lstbcnt = 0;

	return vfd;
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 *		Truncates and unlinks every segment file of the relation, releases
 *		the chained MdfdVec records and the Md_fdvec slot, and marks the
 *		relation closed and unlinked.
 *
 *		Returns SM_SUCCESS; raises elog(ERROR) if the relation could not
 *		be opened.
 */
int
mdunlink(Relation reln)
{
	int			fd;
	MdfdVec    *v;
	MemoryContext oldcxt;

	/*
	 * If the relation is already unlinked, we have nothing to do any more.
	 */
	if (reln->rd_unlinked && reln->rd_fd < 0)
		return SM_SUCCESS;

	/*
	 * Force all segments of the relation to be opened, so that we won't
	 * miss deleting any of them.  The block count itself is not needed.
	 */
	(void) mdnblocks(reln);

	/*
	 * Clean out the mdfd vector, letting fd.c unlink the physical files.
	 *
	 * NOTE: We truncate the file(s) before deleting 'em, because if other
	 * backends are holding the files open, the unlink will fail on some
	 * platforms (think Microsoft).  Better a zero-size file gets left around
	 * than a big file.  Those other backends will be forced to close the
	 * relation by cache invalidation, but that probably hasn't happened yet.
	 */
	fd = RelationGetFile(reln);
	if (fd < 0)					/* should not happen */
		elog(ERROR, "mdunlink: mdnblocks didn't open relation");

	Md_fdvec[fd].mdfd_flags = (uint16) 0;

	oldcxt = MemoryContextSwitchTo(MdCxt);
#ifndef LET_OS_MANAGE_FILESIZE
	for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
	{
		MdfdVec    *ov = v;

		FileTruncate(v->mdfd_vfd, 0);
		FileUnlink(v->mdfd_vfd);
		v = v->mdfd_chain;
		/* the head record is an array slot, not a palloc'd object */
		if (ov != &Md_fdvec[fd])
			pfree(ov);
	}
	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
#else
	v = &Md_fdvec[fd];
	FileTruncate(v->mdfd_vfd, 0);
	FileUnlink(v->mdfd_vfd);
#endif
	MemoryContextSwitchTo(oldcxt);

	_fdvec_free(fd);

	/* be sure to mark relation closed && unlinked */
	reln->rd_fd = -1;
	reln->rd_unlinked = true;

	return SM_SUCCESS;
}

/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		Appends one BLCKSZ-sized block taken from "buffer" to the segment
 *		that holds block number mdnblocks(reln).  If the file currently
 *		ends in a partial block, the new block overwrites that partial
 *		block.  If the write comes up short, the file is truncated back
 *		to where the write started.
 *
 *		This routine returns SM_FAIL or SM_SUCCESS, with errno set as
 *		appropriate.
 */
int
mdextend(Relation reln, char *buffer)
{
	long		pos,
				nbytes;
	int			nblocks;
	MdfdVec    *v;

	nblocks = mdnblocks(reln);
	v = _mdfd_getseg(reln, nblocks);

	if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0)
		return SM_FAIL;

	if (pos % BLCKSZ != 0)		/* the last block is incomplete */
	{
		pos -= pos % BLCKSZ;
		if (FileSeek(v->mdfd_vfd, pos, SEEK_SET) < 0)
			return SM_FAIL;
	}

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		/* undo a partial write so no fragment of a block is left behind */
		if (nbytes > 0)
		{
			FileTruncate(v->mdfd_vfd, pos);
			FileSeek(v->mdfd_vfd, pos, SEEK_SET);
		}
		return SM_FAIL;
	}

	/* try to keep the last block count current, though it's just a hint */
#ifndef LET_OS_MANAGE_FILESIZE
	if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
		v->mdfd_lstbcnt = RELSEG_SIZE;

#ifdef DIAGNOSTIC
	if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
		|| v->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big!");
#endif
#else
	v->mdfd_lstbcnt = ++nblocks;
#endif

	return SM_SUCCESS;
}

/*
 *	mdopen() -- Open the specified relation.
 *
 *		Opens the relation's first segment file and records it in a fresh
 *		Md_fdvec entry.  In bootstrap mode a missing file is created.
 *
 *		Returns the index of the Md_fdvec entry, or -1 on failure (in
 *		which case a NOTICE is logged and the relation is marked closed
 *		and unlinked).  The path string is released on every exit path.
 */
int
mdopen(Relation reln)
{
	char	   *path;
	int			fd;
	int			vfd;

	Assert(reln->rd_fd < 0);
	path = relpath(RelationGetPhysicalRelationName(reln));

#ifndef __CYGWIN32__
	fd = FileNameOpenFile(path, O_RDWR, 0600);
#else
	fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);
#endif

	if (fd < 0)
	{
		/* in bootstrap mode, accept mdopen as substitute for mdcreate */
		if (IsBootstrapProcessingMode())
		{
#ifndef __CYGWIN32__
			fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);
#else
			fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600);
#endif
		}
		if (fd < 0)
		{
			elog(NOTICE, "mdopen: couldn't open %s: %m", path);
			/* mark relation closed and unlinked */
			reln->rd_fd = -1;
			reln->rd_unlinked = true;
			pfree(path);
			return -1;
		}
	}

	/* the file is open now; the path string is no longer needed */
	pfree(path);

	reln->rd_unlinked = false;

	vfd = _fdvec_alloc();
	if (vfd < 0)
	{
		FileClose(fd);
		return -1;
	}

	Md_fdvec[vfd].mdfd_vfd = fd;
	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
	Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifndef LET_OS_MANAGE_FILESIZE
	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;

#ifdef DIAGNOSTIC
	if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on relopen!");
#endif
#endif

	return vfd;
}

/*
 *	mdclose() -- Close the specified relation, if it isn't closed already.
 *
 *		AND FREE fd vector! It may be re-used for other relation!
 *		reln should be flushed from cache after closing !..
 *
 *		Always returns SM_SUCCESS.
 */
int
mdclose(Relation reln)
{
	int			fd;

	fd = RelationGetFile(reln);
	if (fd < 0)
		return SM_SUCCESS;		/* already closed, so no work */

	mdclose_fd(fd);

	/* the Md_fdvec slot is gone; forget its index */
	reln->rd_fd = -1;

	return SM_SUCCESS;
}

static void
mdclose_fd(int fd)
{
	MdfdVec    *v;
	MemoryContext oldcxt;

408
	oldcxt = MemoryContextSwitchTo(MdCxt);
409
#ifndef LET_OS_MANAGE_FILESIZE
410
	for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
411
	{
412 413
		MdfdVec    *ov = v;

414 415 416 417 418
		/* if not closed already */
		if (v->mdfd_vfd >= 0)
		{
			/*
			 * We sync the file descriptor so that we don't need to reopen
419 420 421
			 * it at transaction commit to force changes to disk.  (This
			 * is not really optional, because we are about to forget that
			 * the file even exists...)
422 423 424 425 426 427 428 429 430
			 */
			FileSync(v->mdfd_vfd);
			FileClose(v->mdfd_vfd);
		}
		/* Now free vector */
		v = v->mdfd_chain;
		if (ov != &Md_fdvec[fd])
			pfree(ov);
	}
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
431

432
	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
433 434 435 436 437 438 439 440
#else
	v = &Md_fdvec[fd];
	if (v != (MdfdVec *) NULL)
	{
		if (v->mdfd_vfd >= 0)
		{
			/*
			 * We sync the file descriptor so that we don't need to reopen
441 442 443
			 * it at transaction commit to force changes to disk.  (This
			 * is not really optional, because we are about to forget that
			 * the file even exists...)
444 445 446 447 448 449 450
			 */
			FileSync(v->mdfd_vfd);
			FileClose(v->mdfd_vfd);
		}
	}
#endif
	MemoryContextSwitchTo(oldcxt);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
451

452
	_fdvec_free(fd);
453 454 455
}

/*
 *	mdread() -- Read the specified block from a relation.
 *
 *		The buffer is zero-filled (and SM_SUCCESS returned) when the read
 *		yields no bytes at all, or when block 0 is only partially present
 *		and mdnblocks() reports the relation as empty.  Any other short
 *		read is an error.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdread(Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;
	long		seekpos;
	int			nbytes;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum);

	/* widen to long before multiplying so the byte offset cannot wrap */
#ifndef LET_OS_MANAGE_FILESIZE
	seekpos = (long) BLCKSZ * (blocknum % RELSEG_SIZE);

#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	seekpos = (long) BLCKSZ * (blocknum);
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return SM_FAIL;

	status = SM_SUCCESS;
	if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		if (nbytes == 0)
			MemSet(buffer, 0, BLCKSZ);
		else if (blocknum == 0 && nbytes > 0 && mdnblocks(reln) == 0)
			MemSet(buffer, 0, BLCKSZ);
		else
			status = SM_FAIL;
	}

	return status;
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		Seeks to the block's byte offset within its segment and writes
 *		BLCKSZ bytes from "buffer".  No sync is performed here.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;
	long		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum);

	/* widen to long before multiplying so the byte offset cannot wrap */
#ifndef LET_OS_MANAGE_FILESIZE
	seekpos = (long) BLCKSZ * (blocknum % RELSEG_SIZE);
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	seekpos = (long) BLCKSZ * (blocknum);
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return SM_FAIL;

	status = SM_SUCCESS;
	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		status = SM_FAIL;

	return status;
}

/*
 *	mdflush() -- Synchronously write a block to disk.
 *
 *		This is exactly like mdwrite(), but additionally calls FileSync()
 *		on the segment before returning.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
	int			status;
	long		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum);

	/* widen to long before multiplying so the byte offset cannot wrap */
#ifndef LET_OS_MANAGE_FILESIZE
	seekpos = (long) BLCKSZ * (blocknum % RELSEG_SIZE);
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
#else
	seekpos = (long) BLCKSZ * (blocknum);
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return SM_FAIL;

	/* write and sync the block */
	status = SM_SUCCESS;
	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
		|| FileSync(v->mdfd_vfd) < 0)
		status = SM_FAIL;

	return status;
}

/*
570
 *	mdblindwrt() -- Write a block to disk blind.
571
 *
572
 *		We have to be able to do this using only the name and OID of
573
 *		the database and relation in which the block belongs.  Otherwise
574 575
 *		this is much like mdwrite().  If dofsync is TRUE, then we fsync
 *		the file, making it more like mdflush().
576 577
 */
int
578 579
mdblindwrt(char *dbname,
		   char *relname,
580 581 582
		   Oid dbid,
		   Oid relid,
		   BlockNumber blkno,
583 584
		   char *buffer,
		   bool dofsync)
585
{
586
	int			status;
587
	long		seekpos;
588
	int			fd;
589

590
	fd = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno);
591

592
	if (fd < 0)
593
		return SM_FAIL;
594

595
#ifndef LET_OS_MANAGE_FILESIZE
596
	seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
597 598 599 600
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif
601 602 603 604
#else
	seekpos = (long) (BLCKSZ * (blkno));
#endif

605 606 607
	if (lseek(fd, seekpos, SEEK_SET) != seekpos)
	{
		close(fd);
608
		return SM_FAIL;
609
	}
610

611
	status = SM_SUCCESS;
612 613 614 615 616 617 618 619 620

	/* write and optionally sync the block */
	if (write(fd, buffer, BLCKSZ) != BLCKSZ)
		status = SM_FAIL;
	else if (dofsync &&
			 pg_fsync(fd) < 0)
		status = SM_FAIL;

	if (close(fd) < 0)
621
		status = SM_FAIL;
622

623 624
	return status;
}
625

626 627 628 629 630 631 632 633 634
/*
 *	mdmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdmarkdirty(Relation reln, BlockNumber blkno)
{
	MdfdVec    *v;
635

636 637 638 639 640 641 642 643 644 645 646 647
	v = _mdfd_getseg(reln, blkno);

	FileMarkDirty(v->mdfd_vfd);

	return SM_SUCCESS;
}

/*
 *	mdblindmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
 *
 *		We have to be able to do this using only the name and OID of
 *		the database and relation in which the block belongs.  Otherwise
 *		this is much like mdmarkdirty().  However, we do the fsync immediately
 *		rather than building md/fd datastructures to postpone it till later.
 *
 *		Returns SM_SUCCESS, or SM_FAIL if the segment cannot be opened or
 *		the fsync or close fails.
 */
int
mdblindmarkdirty(char *dbname,
				 char *relname,
				 Oid dbid,
				 Oid relid,
				 BlockNumber blkno)
{
	int			status;
	int			fd;

	fd = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno);

	if (fd < 0)
		return SM_FAIL;

	status = SM_SUCCESS;

	if (pg_fsync(fd) < 0)
		status = SM_FAIL;

	if (close(fd) < 0)
		status = SM_FAIL;

	return status;
}

/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		Important side effect: all segments of the relation are opened
 *		and added to the mdfd_chain list.  If this routine has not been
 *		called, then only segments up to the last one actually touched
 *		are present in the chain...
 *
 *		Each visited segment's mdfd_lstbcnt is refreshed along the way.
 *
 *		Returns # of blocks, elog's on error.
 */
int
mdnblocks(Relation reln)
{
	int			fd;
	MdfdVec    *v;
#ifndef LET_OS_MANAGE_FILESIZE
	int			nblocks;
	int			segno;
#endif

	fd = _mdfd_getrelnfd(reln);
	v = &Md_fdvec[fd];

#ifndef LET_OS_MANAGE_FILESIZE
	segno = 0;
	for (;;)
	{
		nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ);
		if (nblocks > RELSEG_SIZE)
			elog(FATAL, "segment too big in mdnblocks!");
		v->mdfd_lstbcnt = nblocks;
		if (nblocks == RELSEG_SIZE)
		{
			/* a full segment: there may be a following one, go look */
			segno++;

			if (v->mdfd_chain == (MdfdVec *) NULL)
			{
				v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
				if (v->mdfd_chain == (MdfdVec *) NULL)
					elog(ERROR, "cannot count blocks for %s -- open failed",
						 RelationGetRelationName(reln));
			}

			v = v->mdfd_chain;
		}
		else
			return (segno * RELSEG_SIZE) + nblocks;
	}
#else
	return _mdnblocks(v->mdfd_vfd, BLCKSZ);
#endif
}

730
/*
731
 *	mdtruncate() -- Truncate relation to specified number of blocks.
732
 *
733
 *		Returns # of blocks or -1 on error.
734 735
 */
int
736
mdtruncate(Relation reln, int nblocks)
737
{
738
	int			curnblk;
739 740
	int			fd;
	MdfdVec    *v;
741
#ifndef LET_OS_MANAGE_FILESIZE
742 743 744
	MemoryContext oldcxt;
	int			priorblocks;
#endif
745

746 747 748
	/* NOTE: mdnblocks makes sure we have opened all existing segments,
	 * so that truncate/delete loop will get them all!
	 */
749
	curnblk = mdnblocks(reln);
750 751 752 753
	if (nblocks < 0 || nblocks > curnblk)
		return -1;				/* bogus request */
	if (nblocks == curnblk)
		return nblocks;			/* no work */
754

755
	fd = _mdfd_getrelnfd(reln);
756
	v = &Md_fdvec[fd];
757

758
#ifndef LET_OS_MANAGE_FILESIZE
759 760 761
	oldcxt = MemoryContextSwitchTo(MdCxt);
	priorblocks = 0;
	while (v != (MdfdVec *) NULL)
762
	{
763 764 765
		MdfdVec    *ov = v;

		if (priorblocks > nblocks)
766
		{
767 768 769 770 771 772 773 774 775 776 777 778
			/* This segment is no longer wanted at all (and has already been
			 * unlinked from the mdfd_chain).
			 * We truncate the file before deleting it because if other
			 * backends are holding the file open, the unlink will fail on
			 * some platforms.  Better a zero-size file gets left around than
			 * a big file...
			 */
			FileTruncate(v->mdfd_vfd, 0);
			FileUnlink(v->mdfd_vfd);
			v = v->mdfd_chain;
			Assert(ov != &Md_fdvec[fd]); /* we never drop the 1st segment */
			pfree(ov);
779
		}
780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803
		else if (priorblocks + RELSEG_SIZE > nblocks)
		{
			/* This is the last segment we want to keep.
			 * Truncate the file to the right length, and clear chain link
			 * that points to any remaining segments (which we shall zap).
			 * NOTE: if nblocks is exactly a multiple K of RELSEG_SIZE,
			 * we will truncate the K+1st segment to 0 length but keep it.
			 * This is mainly so that the right thing happens if nblocks=0.
			 */
			int lastsegblocks = nblocks - priorblocks;
			if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
				return -1;
			v->mdfd_lstbcnt = lastsegblocks;
			v = v->mdfd_chain;
			ov->mdfd_chain = (MdfdVec *) NULL;
		}
		else
		{
			/* We still need this segment and 0 or more blocks beyond it,
			 * so nothing to do here.
			 */
			v = v->mdfd_chain;
		}
		priorblocks += RELSEG_SIZE;
804
	}
805
	MemoryContextSwitchTo(oldcxt);
806
#else
807
	if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
808
		return -1;
809 810
	v->mdfd_lstbcnt = nblocks;
#endif
811

812
	return nblocks;
813

814
}	/* mdtruncate */
815

816
/*
817
 *	mdcommit() -- Commit a transaction.
818
 *
819 820 821 822
 *		All changes to magnetic disk relations must be forced to stable
 *		storage.  This routine makes a pass over the private table of
 *		file descriptors.  Any descriptors to which we have done writes,
 *		but not synced, are synced here.
823
 *
824
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
825 826 827 828
 */
int
mdcommit()
{
829 830 831 832 833 834
#ifdef XLOG
	sync();
	sleep(1);
	sync();
	return SM_SUCCESS;
#else
835 836
	int			i;
	MdfdVec    *v;
837

838 839
	for (i = 0; i < CurFd; i++)
	{
840 841 842
		v = &Md_fdvec[i];
		if (v->mdfd_flags & MDFD_FREE)
			continue;
843
		/* Sync the file entry */
844
#ifndef LET_OS_MANAGE_FILESIZE
845
		for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain)
846
#else
847
		if (v != (MdfdVec *) NULL)
848
#endif
849 850 851
		{
			if (FileSync(v->mdfd_vfd) < 0)
				return SM_FAIL;
852
		}
853 854
	}

855
	return SM_SUCCESS;
856
#endif	/* XLOG */
857 858 859
}

/*
860
 *	mdabort() -- Abort a transaction.
861
 *
862 863
 *		Changes need not be forced to disk at transaction abort.  We mark
 *		all file descriptors as clean here.  Always returns SM_SUCCESS.
864 865 866 867
 */
int
mdabort()
{
868 869 870
	/* We don't actually have to do anything here.  fd.c will discard
	 * fsync-needed bits in its AtEOXact_Files() routine.
	 */
871
	return SM_SUCCESS;
872 873 874
}

/*
875
 *	_fdvec_alloc () -- grab a free (or new) md file descriptor vector.
876 877 878
 *
 */
static
879 880
int
_fdvec_alloc()
881
{
882 883 884 885
	MdfdVec    *nvec;
	int			fdvec,
				i;
	MemoryContext oldcxt;
886 887

	if (Md_Free >= 0)			/* get from free list */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
888
	{
889 890 891 892 893 894 895 896 897
		fdvec = Md_Free;
		Md_Free = Md_fdvec[fdvec].mdfd_nextFree;
		Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE);
		Md_fdvec[fdvec].mdfd_flags = 0;
		if (fdvec >= CurFd)
		{
			Assert(fdvec == CurFd);
			CurFd++;
		}
898
		return fdvec;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
899
	}
900

901 902 903 904
	/* Must allocate more room */

	if (Nfds != CurFd)
		elog(FATAL, "_fdvec_alloc error");
905

906
	Nfds *= 2;
907

908
	oldcxt = MemoryContextSwitchTo(MdCxt);
909

910
	nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
Bruce Momjian's avatar
Bruce Momjian committed
911
	MemSet(nvec, 0, Nfds * sizeof(MdfdVec));
912 913
	memmove(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec));
	pfree(Md_fdvec);
914

915
	MemoryContextSwitchTo(oldcxt);
916

917
	Md_fdvec = nvec;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
918

919 920 921 922 923 924 925 926
	/* Set new free list */
	for (i = CurFd; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
	Md_Free = CurFd + 1;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
927

928 929 930 931
	fdvec = CurFd;
	CurFd++;
	Md_fdvec[fdvec].mdfd_flags = 0;

932
	return fdvec;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
933 934 935
}

/*
936
 *	_fdvec_free () -- free md file descriptor vector.
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
937 938 939
 *
 */
static
940 941
void
_fdvec_free(int fdvec)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
942
{
943 944

	Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE);
945
	Assert(Md_fdvec[fdvec].mdfd_flags != MDFD_FREE);
946 947 948
	Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
	Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
	Md_Free = fdvec;
949 950 951 952 953
}

/*
 *	_mdfd_openseg() -- Open one segment file of a relation.
 *
 *		Segment 0 uses the relation's plain path; segment N > 0 uses
 *		"path.N".  "oflags" is OR'ed into the open flags.
 *
 *		Returns a MdfdVec palloc'd in MdCxt describing the open segment,
 *		or NULL if the file could not be opened.
 */
static MdfdVec *
_mdfd_openseg(Relation reln, int segno, int oflags)
{
	MemoryContext oldcxt;
	MdfdVec    *v;
	int			fd;
	char	   *path,
			   *fullpath;

	path = relpath(RelationGetPhysicalRelationName(reln));

	if (segno > 0)
	{
		/* be sure we have enough space for the '.segno', if any */
		fullpath = (char *) palloc(strlen(path) + 12);
		sprintf(fullpath, "%s.%d", path, segno);
		pfree(path);
	}
	else
		fullpath = path;

	/* open the file */
#ifndef __CYGWIN32__
	fd = FileNameOpenFile(fullpath, O_RDWR | oflags, 0600);
#else
	fd = FileNameOpenFile(fullpath, O_RDWR | O_BINARY | oflags, 0600);
#endif

	pfree(fullpath);

	if (fd < 0)
		return (MdfdVec *) NULL;

	/* allocate an mdfdvec entry for it */
	oldcxt = MemoryContextSwitchTo(MdCxt);
	v = (MdfdVec *) palloc(sizeof(MdfdVec));
	MemoryContextSwitchTo(oldcxt);

	/* fill the entry */
	v->mdfd_vfd = fd;
	v->mdfd_flags = (uint16) 0;
	v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifndef LET_OS_MANAGE_FILESIZE
	v->mdfd_chain = (MdfdVec *) NULL;

#ifdef DIAGNOSTIC
	if (v->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on open!");
#endif
#endif

	/* all done */
	return v;
}

1006 1007 1008 1009
/* Get the fd for the relation, opening it if it's not already open */

static int
_mdfd_getrelnfd(Relation reln)
1010
{
1011
	int			fd;
1012 1013 1014 1015 1016

	fd = RelationGetFile(reln);
	if (fd < 0)
	{
		if ((fd = mdopen(reln)) < 0)
1017
			elog(ERROR, "cannot open relation %s",
1018 1019 1020
				 RelationGetRelationName(reln));
		reln->rd_fd = fd;
	}
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034
	return fd;
}

/*
 * Find the segment of the relation holding the specified block.
 *
 * Walks the mdfd_chain from the first segment, opening (and creating, via
 * O_CREAT) any segment on the way that is not yet in the chain.  Raises
 * elog(ERROR) if a segment cannot be opened.  When LET_OS_MANAGE_FILESIZE
 * is defined there is only one segment, which is returned directly.
 */

static MdfdVec *
_mdfd_getseg(Relation reln, int blkno)
{
	MdfdVec    *v;
	int			segno;
	int			fd;
	int			i;

	fd = _mdfd_getrelnfd(reln);

#ifndef LET_OS_MANAGE_FILESIZE
	/* segno counts the hops still to go; i is the segment being entered */
	for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1;
		 segno > 0;
		 i++, segno--)
	{

		if (v->mdfd_chain == (MdfdVec *) NULL)
		{
			v->mdfd_chain = _mdfd_openseg(reln, i, O_CREAT);

			if (v->mdfd_chain == (MdfdVec *) NULL)
				elog(ERROR, "cannot open segment %d of relation %s",
					 i, RelationGetRelationName(reln));
		}
		v = v->mdfd_chain;
	}
#else
	v = &Md_fdvec[fd];
#endif

	return v;
}

1059 1060
/*
 * Find the segment of the relation holding the specified block.
1061
 *
1062 1063 1064 1065 1066 1067
 * This performs the same work as _mdfd_getseg() except that we must work
 * "blind" with no Relation struct.  We assume that we are not likely to
 * touch the same relation again soon, so we do not create an FD entry for
 * the relation --- we just open a kernel file descriptor which will be
 * used and promptly closed.  The return value is the kernel descriptor,
 * or -1 on failure.
1068 1069
 */

1070
static int
1071 1072 1073 1074 1075 1076 1077 1078 1079
_mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid,
				   int blkno)
{
	char	   *path;
	int			fd;
#ifndef LET_OS_MANAGE_FILESIZE
	int			segno;
#endif

1080
	/* construct the path to the relation */
1081 1082 1083
	path = relpath_blind(dbname, relname, dbid, relid);

#ifndef LET_OS_MANAGE_FILESIZE
1084 1085 1086
	/* append the '.segno', if needed */
	segno = blkno / RELSEG_SIZE;
	if (segno > 0)
1087
	{
1088
		char   *segpath = (char *) palloc(strlen(path) + 12);
1089 1090

		sprintf(segpath, "%s.%d", path, segno);
1091 1092 1093
		pfree(path);
		path = segpath;
	}
1094 1095
#endif

1096 1097
#ifndef __CYGWIN32__
	fd = open(path, O_RDWR, 0600);
1098
#else
1099
	fd = open(path, O_RDWR | O_BINARY, 0600);
1100 1101 1102 1103
#endif

	pfree(path);

1104
	return fd;
1105 1106
}

1107
static BlockNumber
1108 1109
_mdnblocks(File file, Size blcksz)
{
1110
	long		len;
1111

1112 1113 1114
	len = FileSeek(file, 0L, SEEK_END);
	if (len < 0) return 0;	/* on failure, assume file is empty */
	return (BlockNumber) (len / blcksz);
1115
}