md.c 18.6 KB
Newer Older
1 2 3
/*-------------------------------------------------------------------------
 *
 * md.c--
 *	  This code manages relations that reside on magnetic disk.
 *
 * Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.28 1998/02/23 13:58:04 vadim Exp $
 *
 *-------------------------------------------------------------------------
 */
Bruce Momjian's avatar
Bruce Momjian committed
14
#include <unistd.h>
15
#include <stdio.h>				/* for sprintf() */
Bruce Momjian's avatar
Bruce Momjian committed
16
#include <string.h>
17
#include <fcntl.h>				/* for open() flags */
18 19 20
#include <sys/file.h>

#include "postgres.h"
21
#include "miscadmin.h"			/* for DataDir */
22

23
#include "catalog/catalog.h"
24
#include "storage/block.h"
Bruce Momjian's avatar
Bruce Momjian committed
25
#include "storage/fd.h"
26
#include "storage/smgr.h"		/* where the declarations go */
27 28 29 30 31 32
#include "utils/mcxt.h"
#include "utils/rel.h"

#undef DIAGNOSTIC

/*
 *	The magnetic disk storage manager keeps track of open file descriptors
 *	in its own descriptor pool.  This happens for two reasons.  First, at
 *	transaction boundaries, we walk the list of descriptors and flush
 *	anything that we've dirtied in the current transaction.  Second, we
 *	have to support relations of > 4GBytes.  In order to do this, we break
 *	relations up into chunks of < 2GBytes and store one chunk in each of
 *	several files that represent the relation.
 */

42 43
typedef struct _MdfdVec
{
44 45 46 47
	int			mdfd_vfd;		/* fd number in vfd pool */
	uint16		mdfd_flags;		/* clean, dirty, free */
	int			mdfd_lstbcnt;	/* most recent block count */
	int			mdfd_nextFree;	/* next free vector */
48
	struct _MdfdVec *mdfd_chain;/* for large relations */
49
} MdfdVec;
50

51
static int	Nfds = 100;
52
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
53 54
static int	Md_Free = -1;
static int	CurFd = 0;
55
static MemoryContext MdCxt;
56

57 58
#define MDFD_DIRTY		(uint16) 0x01
#define MDFD_FREE		(uint16) 0x02
59

60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
/*
 * RELSEG_SIZE is the number of blocks stored in one segment file of a
 * relation: the routines in this file use blkno / RELSEG_SIZE as the
 * segment number and blkno % RELSEG_SIZE as the block position inside
 * that segment.  It was defined as 262144 based on 8k blocks, but now
 * that the block size can be changed, this has to be calculated at
 * compile time.  Otherwise, the file size limit would not work out to
 * 2-gig (2147483648).
 *
 * The number needs to be (2 ** 31) / BLCKSZ, but to keep the math
 * under MAXINT, pre-divide by 256 and use ...
 *
 *           (((2 ** 23) / BLCKSZ) * (2 ** 8))
 *
 * 07 Jan 98  darrenk
 */

#define RELSEG_SIZE		((8388608 / BLCKSZ) * 256)
76 77

/* routines declared here */
78 79
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
80 81
static int	_fdvec_alloc(void);
static void _fdvec_free(int);
82 83 84
static BlockNumber _mdnblocks(File file, Size blcksz);

/*
85
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
86
 *
87 88 89 90 91
 *		We keep a private table of all file descriptors.  Whenever we do
 *		a write to one, we mark it dirty in our table.	Whenever we force
 *		changes to disk, we mark the file descriptor clean.  At transaction
 *		commit, we force changes to disk for all dirty file descriptors.
 *		This routine allocates and initializes the table.
92
 *
93
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
94 95 96 97
 */
int
mdinit()
{
98 99
	MemoryContext oldcxt;
	int			i;
100

101 102 103
	MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
	if (MdCxt == (MemoryContext) NULL)
		return (SM_FAIL);
104

105 106 107
	oldcxt = MemoryContextSwitchTo(MdCxt);
	Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
	MemoryContextSwitchTo(oldcxt);
108

109 110
	if (Md_fdvec == (MdfdVec *) NULL)
		return (SM_FAIL);
111

Bruce Momjian's avatar
Bruce Momjian committed
112
	MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
113

114 115 116 117 118 119 120 121
	/* Set free list */
	for (i = 0; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_Free = 0;
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
122

123
	return (SM_SUCCESS);
124 125 126 127 128
}

/*
 *	mdcreate() -- Create the disk file for a new relation.
 *
 *		The file is created exclusively (O_CREAT | O_EXCL) and registered
 *		in a freshly allocated Md_fdvec slot.
 *
 *		Returns the index of that slot, or -1 on failure.
 */
int
mdcreate(Relation reln)
{
	int			fd,
				vfd;
	char	   *path;

	path = relpath(&(reln->rd_rel->relname.data[0]));
	fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);

	/*
	 * Outside of bootstrap processing a failed exclusive create is an
	 * error.  During bootstrap processing we fall back to opening the
	 * existing file instead, because pg_time, pg_variable, and pg_log get
	 * created before their .bki file entries are processed.
	 *
	 * Pretending that the create succeeded for an already existing file
	 * made it possible to have in pg_class > 1 records with the same
	 * relname. Actually, it should be fixed in upper levels, too,
	 * but... -	vadim 05/06/97
	 */

	if (fd < 0)
	{
		if (!IsBootstrapProcessingMode())
			return (-1);
		fd = FileNameOpenFile(path, O_RDWR, 0600);		/* Bootstrap */
		if (fd < 0)
			return (-1);
	}

	vfd = _fdvec_alloc();
	if (vfd < 0)
	{
		/* don't leave the file open when there is no slot to track it */
		FileClose(fd);
		return (-1);
	}

	Md_fdvec[vfd].mdfd_vfd = fd;
	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
	Md_fdvec[vfd].mdfd_lstbcnt = 0;

	return (vfd);
}

/*
 *	mdunlink() -- Remove a relation's files and release its table slot.
 *
 *		Returns SM_FAIL when the base file cannot be unlinked, otherwise
 *		SM_SUCCESS.
 */
int
mdunlink(Relation reln)
{
	char		relname[NAMEDATALEN];
	char		segname[NAMEDATALEN + 10];		/* relname plus ".N" */
	int			segno;
	int			fd;
	MdfdVec    *cur,
			   *done;
	MemoryContext savecxt;

	/*
	 * On Windows NT you can't unlink a file if it is open so we have to
	 * do this.
	 */

	StrNCpy(relname, RelationGetRelationName(reln)->data, NAMEDATALEN);

	if (FileNameUnlink(relname) < 0)
		return (SM_FAIL);

	/* overflow segments are named relname.1, relname.2, ...; stop at the
	 * first one that cannot be removed */
	segno = 1;
	for (;;)
	{
		sprintf(segname, "%s.%d", relname, segno);
		if (FileNameUnlink(segname) < 0)
			break;
		segno++;
	}

	/* now tear down the descriptor chain that hangs off our slot */
	fd = RelationGetFile(reln);
	Md_fdvec[fd].mdfd_flags = (uint16) 0;

	savecxt = MemoryContextSwitchTo(MdCxt);
	cur = &Md_fdvec[fd];
	while (cur != (MdfdVec *) NULL)
	{
		FileUnlink(cur->mdfd_vfd);
		done = cur;
		cur = cur->mdfd_chain;

		/* the head is part of the Md_fdvec array, never pfree it */
		if (done != &Md_fdvec[fd])
			pfree(done);
	}
	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
	MemoryContextSwitchTo(savecxt);

	_fdvec_free(fd);

	return (SM_SUCCESS);
}

/*
 *	mdextend() -- Append one block to the specified relation.
 *
 *		The block is written at the end of the segment that holds block
 *		number mdnblocks(reln); that segment is opened with O_CREAT.
 *
 *		This routine returns SM_FAIL or SM_SUCCESS, with errno set as
 *		appropriate.
 */
int
mdextend(Relation reln, char *buffer)
{
	int			nblocks;
	MdfdVec    *seg;

	nblocks = mdnblocks(reln);
	seg = _mdfd_getseg(reln, nblocks, O_CREAT);

	if (FileSeek(seg->mdfd_vfd, 0L, SEEK_END) < 0)
		return (SM_FAIL);

	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		return (SM_FAIL);

	/* the segment now needs a sync at transaction commit */
	seg->mdfd_flags |= MDFD_DIRTY;

	/*
	 * Refresh the block-count hint.  A count that is an exact multiple
	 * of RELSEG_SIZE means this segment is full, not empty.
	 */
	nblocks++;
	seg->mdfd_lstbcnt = nblocks % RELSEG_SIZE;
	if (seg->mdfd_lstbcnt == 0)
		seg->mdfd_lstbcnt = RELSEG_SIZE;

#ifdef DIAGNOSTIC
	if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
		|| seg->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big!");
#endif

	return (SM_SUCCESS);
}

/*
 *	mdopen() -- Open the specified relation.
 *
 *		Opens the first segment file of the relation and records it in a
 *		newly allocated Md_fdvec slot.
 *
 *		Returns the index of that slot, or -1 if the file could not be
 *		opened or no slot could be allocated.
 */
int
mdopen(Relation reln)
{
	char	   *path;
	int			fd;
	int			vfd;

	path = relpath(&(reln->rd_rel->relname.data[0]));

	fd = FileNameOpenFile(path, O_RDWR, 0600);

	/* this should only happen during bootstrap processing */
	if (fd < 0)
		fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);

	/*
	 * Both attempts failed: report that to the caller instead of storing
	 * an invalid file number in the table and measuring its size below.
	 */
	if (fd < 0)
		return (-1);

	vfd = _fdvec_alloc();
	if (vfd < 0)
	{
		/* no slot to remember the file in, so don't keep it open */
		FileClose(fd);
		return (-1);
	}

	Md_fdvec[vfd].mdfd_vfd = fd;
	Md_fdvec[vfd].mdfd_flags = (uint16) 0;
	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
	Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);

#ifdef DIAGNOSTIC
	if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on relopen!");
#endif

	return (vfd);
}

/*
 *	mdclose() -- Close the specified relation
 *
 *		AND FREE fd vector! It may be re-used for other relation!
 *		reln should be flushed from cache after closing !..
 *
 *		Every segment that is still open is synced and closed, chained
 *		segment entries are released, and the relation's Md_fdvec slot
 *		goes back onto the free list.
 *
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
 */
int
mdclose(Relation reln)
{
	int			fd;
	MdfdVec    *cur,
			   *done;
	MemoryContext savecxt;

	fd = RelationGetFile(reln);

	savecxt = MemoryContextSwitchTo(MdCxt);
	cur = &Md_fdvec[fd];
	while (cur != (MdfdVec *) NULL)
	{
		/* skip segments that were closed already */
		if (cur->mdfd_vfd >= 0)
		{
			/*
			 * Syncing now means the file need not be reopened at
			 * transaction commit just to force its changes to disk.
			 */
			FileSync(cur->mdfd_vfd);
			FileClose(cur->mdfd_vfd);

			/* nothing left to sync for this descriptor */
			cur->mdfd_flags &= ~MDFD_DIRTY;
		}

		/* advance first, then release the entry we just left */
		done = cur;
		cur = cur->mdfd_chain;
		if (done != &Md_fdvec[fd])
			pfree(done);
	}

	MemoryContextSwitchTo(savecxt);
	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;

	_fdvec_free(fd);

	return (SM_SUCCESS);
}

/*
 *	mdread() -- Read the specified block from a relation.
 *
 *		A read that hits end of file (zero bytes returned) hands back a
 *		zero-filled block and still counts as success.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdread(Relation reln, BlockNumber blocknum, char *buffer)
{
	long		seekpos;
	int			nbytes;
	MdfdVec    *seg;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block inside its segment file */
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));

#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif

	if (FileSeek(seg->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return (SM_FAIL);

	nbytes = FileRead(seg->mdfd_vfd, buffer, BLCKSZ);
	if (nbytes == BLCKSZ)
		return (SM_SUCCESS);

	/* anything other than "nothing at all" is a failed short read */
	if (nbytes != 0)
		return (SM_FAIL);

	MemSet(buffer, 0, BLCKSZ);
	return (SM_SUCCESS);
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		The segment is flagged dirty whenever the write was attempted,
 *		whether or not it succeeded.
 *
 *		Returns SM_SUCCESS or SM_FAIL.
 */
int
mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
	int			result;
	long		seekpos;
	MdfdVec    *seg;

	seg = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block inside its segment file */
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif

	if (FileSeek(seg->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return (SM_FAIL);

	if (FileWrite(seg->mdfd_vfd, buffer, BLCKSZ) == BLCKSZ)
		result = SM_SUCCESS;
	else
		result = SM_FAIL;

	seg->mdfd_flags |= MDFD_DIRTY;

	return (result);
}

/*
 *	mdflush() -- Synchronously write a block to disk.
 *
 *		This is exactly like mdwrite(), but doesn't return until the file
 *		system buffer cache has been flushed.
 *
 *		Returns SM_SUCCESS or SM_FAIL.  The descriptor is marked clean
 *		only when both the write and the sync succeeded.
 */
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
	long		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, 0);

	/* byte offset of the block inside its segment file */
	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
	if (seekpos >= BLCKSZ * RELSEG_SIZE)
		elog(FATAL, "seekpos too big!");
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
	{
		return (SM_FAIL);
	}

	/* write and sync the block */
	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
		|| FileSync(v->mdfd_vfd) < 0)
	{
		/*
		 * Something may have reached the file without being forced to
		 * stable storage.  Keep the descriptor dirty, as mdwrite() does,
		 * so that a later commit still syncs it.
		 */
		v->mdfd_flags |= MDFD_DIRTY;
		return (SM_FAIL);
	}

	/*
	 * By here, the block is written and changes have been forced to
	 * stable storage.  Mark the descriptor as clean until the next write,
	 * so we don't sync it again unnecessarily at transaction commit.
	 */
	v->mdfd_flags &= ~MDFD_DIRTY;

	return (SM_SUCCESS);
}

/*
 *	mdblindwrt() -- Write a block to disk blind.
 *
 *		We have to be able to do this using only the name and OID of
 *		the database and relation in which the block belongs.  This
 *		is a synchronous write.
 *
 *		dbstr/dbid name the database, relstr names the relation, blkno
 *		is the block to write and buffer holds BLCKSZ bytes of data.
 *		relid is not used by this routine.
 *
 *		Returns SM_SUCCESS or SM_FAIL.  The path buffer built here is
 *		released on every return path.
 */
int
mdblindwrt(char *dbstr,
		   char *relstr,
		   Oid dbid,
		   Oid relid,
		   BlockNumber blkno,
		   char *buffer)
{
	int			fd;
	int			segno;
	long		seekpos;
	int			status;
	char	   *path;
	int			nchars;

	/* be sure we have enough space for the '.segno', if any */
	segno = blkno / RELSEG_SIZE;
	if (segno > 0)
		nchars = 10;
	else
		nchars = 0;

	/* construct the path to the file and open it */
	/* system table? then put in system area... */
	if (dbid == (Oid) 0)
	{
		path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s/%s", DataDir, relstr);
		else
			sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
	}
	/* user table? then put in user database area... */
	else if (dbid == MyDatabaseId)
	{
		extern char	   *DatabasePath;

		path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
		else
			sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
	}
	else	/* this is work arround only !!! */
	{
		char	dbpath[MAXPGPATH+1];
		Oid		owner, id;
		char   *tmpPath;

		/* some other database: look up its path by name */
		GetRawDatabaseInfo(dbstr, &owner, &id, dbpath);

		if (id != dbid)
			elog (FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
		tmpPath = ExpandDatabasePath(dbpath);
		if (tmpPath == NULL)
			elog (FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
		path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
		if (segno == 0)
			sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
		else
			sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
		pfree (tmpPath);
	}

	/* anything short of a complete write-and-sync is a failure */
	status = SM_FAIL;

	fd = open(path, O_RDWR, 0600);
	if (fd >= 0)
	{
		/* seek to the right spot, then write and sync the block */
		seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
		if (lseek(fd, seekpos, SEEK_SET) == seekpos
			&& write(fd, buffer, BLCKSZ) == BLCKSZ
			&& pg_fsync(fd) >= 0)
			status = SM_SUCCESS;

		if (close(fd) < 0)
			status = SM_FAIL;
	}

	/* single exit: the path is freed on failure as well as on success */
	pfree(path);

	return (status);
}

/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		Walks the relation's segments in order.  Each full segment
 *		contributes RELSEG_SIZE blocks; the first segment that is not
 *		full ends the walk.  A missing follow-on segment is opened with
 *		O_CREAT.
 *
 *		Returns # of blocks or -1 on error.
 */
int
mdnblocks(Relation reln)
{
	int			fd;
	MdfdVec    *seg;
	int			nblocks;
	int			segno;

	fd = RelationGetFile(reln);
	seg = &Md_fdvec[fd];

#ifdef DIAGNOSTIC
	if (_mdnblocks(seg->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
		elog(FATAL, "segment too big in getseg!");
#endif

	for (segno = 0;; segno++)
	{
		/* trust the hint when it says "full", otherwise measure the file */
		if (seg->mdfd_lstbcnt != RELSEG_SIZE)
		{
			nblocks = _mdnblocks(seg->mdfd_vfd, BLCKSZ);
			if (nblocks != RELSEG_SIZE)
				return ((segno * RELSEG_SIZE) + nblocks);
		}

		/* this segment is full: record that and move on to the next */
		seg->mdfd_lstbcnt = RELSEG_SIZE;

		if (seg->mdfd_chain == (MdfdVec *) NULL)
		{
			seg->mdfd_chain = _mdfd_openseg(reln, segno + 1, O_CREAT);
			if (seg->mdfd_chain == (MdfdVec *) NULL)
				elog(ERROR, "cannot count blocks for %.16s -- open failed",
					 RelationGetRelationName(reln));
		}

		seg = seg->mdfd_chain;
	}
}

610
/*
611
 *	mdtruncate() -- Truncate relation to specified number of blocks.
612
 *
613
 *		Returns # of blocks or -1 on error.
614 615
 */
int
616
mdtruncate(Relation reln, int nblocks)
617
{
618 619 620
	int			fd;
	MdfdVec    *v;
	int			curnblk;
621

622 623 624 625 626 627 628 629 630 631
	curnblk = mdnblocks(reln);
	if (curnblk / RELSEG_SIZE > 0)
	{
		elog(NOTICE, "Can't truncate multi-segments relation %s",
			 &(reln->rd_rel->relname.data[0]));
		return (curnblk);
	}

	fd = RelationGetFile(reln);
	v = &Md_fdvec[fd];
632

633 634
	if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
		return (-1);
635

636
	return (nblocks);
637

638
}								/* mdtruncate */
639

640
/*
641
 *	mdcommit() -- Commit a transaction.
642
 *
643 644 645 646
 *		All changes to magnetic disk relations must be forced to stable
 *		storage.  This routine makes a pass over the private table of
 *		file descriptors.  Any descriptors to which we have done writes,
 *		but not synced, are synced here.
647
 *
648
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
649 650 651 652
 */
int
mdcommit()
{
653 654
	int			i;
	MdfdVec    *v;
655

656 657 658 659 660 661 662 663 664 665 666 667
	for (i = 0; i < CurFd; i++)
	{
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
		{
			if (v->mdfd_flags & MDFD_DIRTY)
			{
				if (FileSync(v->mdfd_vfd) < 0)
					return (SM_FAIL);

				v->mdfd_flags &= ~MDFD_DIRTY;
			}
		}
668 669
	}

670
	return (SM_SUCCESS);
671 672 673
}

/*
674
 *	mdabort() -- Abort a transaction.
675
 *
676 677
 *		Changes need not be forced to disk at transaction abort.  We mark
 *		all file descriptors as clean here.  Always returns SM_SUCCESS.
678 679 680 681
 */
int
mdabort()
{
682 683
	int			i;
	MdfdVec    *v;
684

685 686 687 688 689 690
	for (i = 0; i < CurFd; i++)
	{
		for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
		{
			v->mdfd_flags &= ~MDFD_DIRTY;
		}
691 692
	}

693
	return (SM_SUCCESS);
694 695 696
}

/*
697
 *	_fdvec_alloc () -- grab a free (or new) md file descriptor vector.
698 699 700
 *
 */
static
701 702
int
_fdvec_alloc()
703
{
704 705 706 707
	MdfdVec    *nvec;
	int			fdvec,
				i;
	MemoryContext oldcxt;
708 709

	if (Md_Free >= 0)			/* get from free list */
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
710
	{
711 712 713 714 715 716 717 718 719 720
		fdvec = Md_Free;
		Md_Free = Md_fdvec[fdvec].mdfd_nextFree;
		Assert(Md_fdvec[fdvec].mdfd_flags == MDFD_FREE);
		Md_fdvec[fdvec].mdfd_flags = 0;
		if (fdvec >= CurFd)
		{
			Assert(fdvec == CurFd);
			CurFd++;
		}
		return (fdvec);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
721
	}
722

723 724 725 726
	/* Must allocate more room */

	if (Nfds != CurFd)
		elog(FATAL, "_fdvec_alloc error");
727

728
	Nfds *= 2;
729

730
	oldcxt = MemoryContextSwitchTo(MdCxt);
731

732
	nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
Bruce Momjian's avatar
Bruce Momjian committed
733
	MemSet(nvec, 0, Nfds * sizeof(MdfdVec));
734 735
	memmove(nvec, (char *) Md_fdvec, CurFd * sizeof(MdfdVec));
	pfree(Md_fdvec);
736

737
	MemoryContextSwitchTo(oldcxt);
738

739
	Md_fdvec = nvec;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
740

741 742 743 744 745 746 747 748
	/* Set new free list */
	for (i = CurFd; i < Nfds; i++)
	{
		Md_fdvec[i].mdfd_nextFree = i + 1;
		Md_fdvec[i].mdfd_flags = MDFD_FREE;
	}
	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;
	Md_Free = CurFd + 1;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
749

750 751 752 753 754
	fdvec = CurFd;
	CurFd++;
	Md_fdvec[fdvec].mdfd_flags = 0;

	return (fdvec);
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
755 756 757
}

/*
758
 *	_fdvec_free () -- free md file descriptor vector.
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
759 760 761
 *
 */
static
762 763
void
_fdvec_free(int fdvec)
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
764
{
765 766 767 768 769

	Assert(Md_Free < 0 || Md_fdvec[Md_Free].mdfd_flags == MDFD_FREE);
	Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
	Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
	Md_Free = fdvec;
Vadim B. Mikheev's avatar
Vadim B. Mikheev committed
770

771 772 773 774 775
}

/*
 *	_mdfd_openseg() -- open one segment file of a relation.
 *
 *		Segment 0 is the relation's base file; segment N > 0 is the base
 *		path with ".N" appended.  oflags is OR-ed into O_RDWR.
 *
 *		Returns a new MdfdVec allocated in MdCxt, or NULL if the file
 *		could not be opened.
 */
static MdfdVec *
_mdfd_openseg(Relation reln, int segno, int oflags)
{
	MemoryContext savecxt;
	MdfdVec    *seg;
	int			fd;
	char	   *basepath,
			   *segpath;

	basepath = relpath(RelationGetRelationName(reln)->data);

	if (segno > 0)
	{
		/* leave room for the '.segno' suffix and the terminator */
		segpath = (char *) palloc(strlen(basepath) + 12);
		sprintf(segpath, "%s.%d", basepath, segno);

		fd = PathNameOpenFile(segpath, O_RDWR | oflags, 0600);

		pfree(segpath);
	}
	else
		fd = PathNameOpenFile(basepath, O_RDWR | oflags, 0600);

	if (fd < 0)
		return ((MdfdVec *) NULL);

	/* the entry has to outlive the caller's memory context */
	savecxt = MemoryContextSwitchTo(MdCxt);
	seg = (MdfdVec *) palloc(sizeof(MdfdVec));
	MemoryContextSwitchTo(savecxt);

	seg->mdfd_vfd = fd;
	seg->mdfd_flags = (uint16) 0;
	seg->mdfd_chain = (MdfdVec *) NULL;
	seg->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);

#ifdef DIAGNOSTIC
	if (seg->mdfd_lstbcnt > RELSEG_SIZE)
		elog(FATAL, "segment too big on open!");
#endif

	return (seg);
}

/*
 *	_mdfd_getseg() -- find the segment entry that holds a given block.
 *
 *		Opens the relation first when it has no file yet, then follows
 *		the segment chain up to segment blkno / RELSEG_SIZE.  Segments
 *		missing from the chain are opened with oflag.
 */
static MdfdVec *
_mdfd_getseg(Relation reln, int blkno, int oflag)
{
	MdfdVec    *seg;
	int			wanted;
	int			fd;
	int			i;

	fd = RelationGetFile(reln);
	if (fd < 0)
	{
		fd = mdopen(reln);
		if (fd < 0)
			elog(ERROR, "cannot open relation %.16s",
				 RelationGetRelationName(reln));
		reln->rd_fd = fd;
	}

	/* start at the base segment and step forward one segment at a time */
	seg = &Md_fdvec[fd];
	wanted = blkno / RELSEG_SIZE;

	for (i = 1; i <= wanted; i++)
	{
		if (seg->mdfd_chain == (MdfdVec *) NULL)
		{
			seg->mdfd_chain = _mdfd_openseg(reln, i, oflag);

			if (seg->mdfd_chain == (MdfdVec *) NULL)
				elog(ERROR, "cannot open segment %d of relation %.16s",
					 i, RelationGetRelationName(reln));
		}
		seg = seg->mdfd_chain;
	}

	return (seg);
}

861
static BlockNumber
862 863
_mdnblocks(File file, Size blcksz)
{
864
	long		len;
865 866 867

	len = FileSeek(file, 0L, SEEK_END) - 1;
	return ((BlockNumber) ((len < 0) ? 0 : 1 + len / blcksz));
868
}