Commit c24dcd0c authored by Thomas Munro

Use pg_pread() and pg_pwrite() for data files and WAL.

Cut down on system calls by doing random I/O using offset-based OS
routines where available.  Remove the code for tracking the 'virtual'
seek position.  The only reason left to call FileSeek() was to get
the file's size, so provide a new function FileSize() instead.

Author: Oskari Saarenmaa, Thomas Munro
Reviewed-by: Thomas Munro, Jesper Pedersen, Tom Lane, Alvaro Herrera
Discussion: https://postgr.es/m/CAEepm=02rapCpPR3ZGF2vW=SBHSdFYO_bz_f-wwWJonmA3APgw@mail.gmail.com
Discussion: https://postgr.es/m/b8748d39-0b19-0514-a1b9-4e5a28e6a208%40gmail.com
Discussion: https://postgr.es/m/a86bd200-ebbe-d829-e3ca-0c4474b2fcb7%40ohmu.fi
parent 3fd2a793
......@@ -935,7 +935,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state)
* Note that we deviate from the usual WAL coding practices here,
* check the above "Logical rewrite support" comment for reasoning.
*/
written = FileWrite(src->vfd, waldata_start, len,
written = FileWrite(src->vfd, waldata_start, len, src->off,
WAIT_EVENT_LOGICAL_REWRITE_WRITE);
if (written != len)
ereport(ERROR,
......
......@@ -2478,18 +2478,6 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
Size nleft;
int written;
/* Need to seek in the file? */
if (openLogOff != startoffset)
{
if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not seek in log file %s to offset %u: %m",
XLogFileNameP(ThisTimeLineID, openLogSegNo),
startoffset)));
openLogOff = startoffset;
}
/* OK to write the page(s) */
from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
nbytes = npages * (Size) XLOG_BLCKSZ;
......@@ -2498,7 +2486,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
written = write(openLogFile, from, nleft);
written = pg_pwrite(openLogFile, from, nleft, startoffset);
pgstat_report_wait_end();
if (written <= 0)
{
......@@ -2513,6 +2501,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
}
nleft -= written;
from += written;
startoffset += written;
} while (nleft > 0);
/* Update state for write */
......@@ -11821,22 +11810,9 @@ retry:
/* Read the requested page */
readOff = targetPageOff;
if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
{
char fname[MAXFNAMELEN];
int save_errno = errno;
XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
errno = save_errno;
ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
(errcode_for_file_access(),
errmsg("could not seek in log segment %s to offset %u: %m",
fname, readOff)));
goto next_record_is_invalid;
}
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
r = read(readFile, readBuf, XLOG_BLCKSZ);
r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
if (r != XLOG_BLCKSZ)
{
char fname[MAXFNAMELEN];
......
......@@ -67,12 +67,6 @@ struct BufFile
int numFiles; /* number of physical files in set */
/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
File *files; /* palloc'd array with numFiles entries */
off_t *offsets; /* palloc'd array with numFiles entries */
/*
* offsets[i] is the current seek position of files[i]. We use this to
* avoid making redundant FileSeek calls.
*/
bool isInterXact; /* keep open over transactions? */
bool dirty; /* does buffer need to be written? */
......@@ -116,7 +110,6 @@ makeBufFileCommon(int nfiles)
BufFile *file = (BufFile *) palloc(sizeof(BufFile));
file->numFiles = nfiles;
file->offsets = (off_t *) palloc0(sizeof(off_t) * nfiles);
file->isInterXact = false;
file->dirty = false;
file->resowner = CurrentResourceOwner;
......@@ -170,10 +163,7 @@ extendBufFile(BufFile *file)
file->files = (File *) repalloc(file->files,
(file->numFiles + 1) * sizeof(File));
file->offsets = (off_t *) repalloc(file->offsets,
(file->numFiles + 1) * sizeof(off_t));
file->files[file->numFiles] = pfile;
file->offsets[file->numFiles] = 0L;
file->numFiles++;
}
......@@ -396,7 +386,6 @@ BufFileClose(BufFile *file)
FileClose(file->files[i]);
/* release the buffer space */
pfree(file->files);
pfree(file->offsets);
pfree(file);
}
......@@ -422,27 +411,17 @@ BufFileLoadBuffer(BufFile *file)
file->curOffset = 0L;
}
/*
* May need to reposition physical file.
*/
thisfile = file->files[file->curFile];
if (file->curOffset != file->offsets[file->curFile])
{
if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
return; /* seek failed, read nothing */
file->offsets[file->curFile] = file->curOffset;
}
/*
* Read whatever we can get, up to a full bufferload.
*/
thisfile = file->files[file->curFile];
file->nbytes = FileRead(thisfile,
file->buffer.data,
sizeof(file->buffer),
file->curOffset,
WAIT_EVENT_BUFFILE_READ);
if (file->nbytes < 0)
file->nbytes = 0;
file->offsets[file->curFile] += file->nbytes;
/* we choose not to advance curOffset here */
if (file->nbytes > 0)
......@@ -491,23 +470,14 @@ BufFileDumpBuffer(BufFile *file)
if ((off_t) bytestowrite > availbytes)
bytestowrite = (int) availbytes;
/*
* May need to reposition physical file.
*/
thisfile = file->files[file->curFile];
if (file->curOffset != file->offsets[file->curFile])
{
if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
return; /* seek failed, give up */
file->offsets[file->curFile] = file->curOffset;
}
bytestowrite = FileWrite(thisfile,
file->buffer.data + wpos,
bytestowrite,
file->curOffset,
WAIT_EVENT_BUFFILE_WRITE);
if (bytestowrite <= 0)
return; /* failed to write */
file->offsets[file->curFile] += bytestowrite;
file->curOffset += bytestowrite;
wpos += bytestowrite;
......@@ -803,11 +773,10 @@ BufFileSize(BufFile *file)
{
off_t lastFileSize;
/* Get the size of the last physical file by seeking to end. */
lastFileSize = FileSeek(file->files[file->numFiles - 1], 0, SEEK_END);
/* Get the size of the last physical file. */
lastFileSize = FileSize(file->files[file->numFiles - 1]);
if (lastFileSize < 0)
return -1;
file->offsets[file->numFiles - 1] = lastFileSize;
return ((file->numFiles - 1) * (off_t) MAX_PHYSICAL_FILESIZE) +
lastFileSize;
......@@ -849,13 +818,8 @@ BufFileAppend(BufFile *target, BufFile *source)
target->files = (File *)
repalloc(target->files, sizeof(File) * newNumFiles);
target->offsets = (off_t *)
repalloc(target->offsets, sizeof(off_t) * newNumFiles);
for (i = target->numFiles; i < newNumFiles; i++)
{
target->files[i] = source->files[i - target->numFiles];
target->offsets[i] = source->offsets[i - target->numFiles];
}
target->numFiles = newNumFiles;
return startBlock;
......
This diff is collapsed.
......@@ -522,22 +522,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
/*
* Note: because caller usually obtained blocknum by calling mdnblocks,
* which did a seek(SEEK_END), this seek is often redundant and will be
* optimized away by fd.c. It's not redundant, however, if there is a
* partial page at the end of the file. In that case we want to try to
* overwrite the partial page with a full page. It's also not redundant
* if bufmgr.c had to dump another buffer of the same file to make room
* for the new page's buffer.
*/
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not seek to block %u in file \"%s\": %m",
blocknum, FilePathName(v->mdfd_vfd))));
if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
{
if (nbytes < 0)
ereport(ERROR,
......@@ -748,13 +733,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not seek to block %u in file \"%s\": %m",
blocknum, FilePathName(v->mdfd_vfd))));
nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);
nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
......@@ -824,13 +803,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not seek to block %u in file \"%s\": %m",
blocknum, FilePathName(v->mdfd_vfd))));
nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_WRITE);
nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
......@@ -1979,7 +1952,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
off_t len;
len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
len = FileSize(seg->mdfd_vfd);
if (len < 0)
ereport(ERROR,
(errcode_for_file_access(),
......
......@@ -15,7 +15,7 @@
/*
* calls:
*
* File {Close, Read, Write, Seek, Tell, Sync}
* File {Close, Read, Write, Size, Sync}
* {Path Name Open, Allocate, Free} File
*
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
......@@ -42,10 +42,6 @@
#include <dirent.h>
/*
* FileSeek uses the standard UNIX lseek(2) flags.
*/
typedef int File;
......@@ -68,10 +64,10 @@ extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fil
extern File OpenTemporaryFile(bool interXact);
extern void FileClose(File file);
extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info);
extern int FileRead(File file, char *buffer, int amount, uint32 wait_event_info);
extern int FileWrite(File file, char *buffer, int amount, uint32 wait_event_info);
extern int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
extern int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
extern int FileSync(File file, uint32 wait_event_info);
extern off_t FileSeek(File file, off_t offset, int whence);
extern off_t FileSize(File file);
extern int FileTruncate(File file, off_t offset, uint32 wait_event_info);
extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info);
extern char *FilePathName(File file);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment