Commit c24dcd0c authored by Thomas Munro

Use pg_pread() and pg_pwrite() for data files and WAL.

Cut down on system calls by doing random I/O using offset-based OS
routines where available.  Remove the code for tracking the 'virtual'
seek position.  The only reason left to call FileSeek() was to get
the file's size, so provide a new function FileSize() instead.

Author: Oskari Saarenmaa, Thomas Munro
Reviewed-by: Thomas Munro, Jesper Pedersen, Tom Lane, Alvaro Herrera
Discussion: https://postgr.es/m/CAEepm=02rapCpPR3ZGF2vW=SBHSdFYO_bz_f-wwWJonmA3APgw@mail.gmail.com
Discussion: https://postgr.es/m/b8748d39-0b19-0514-a1b9-4e5a28e6a208%40gmail.com
Discussion: https://postgr.es/m/a86bd200-ebbe-d829-e3ca-0c4474b2fcb7%40ohmu.fi
parent 3fd2a793
...@@ -935,7 +935,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state) ...@@ -935,7 +935,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state)
* Note that we deviate from the usual WAL coding practices here, * Note that we deviate from the usual WAL coding practices here,
* check the above "Logical rewrite support" comment for reasoning. * check the above "Logical rewrite support" comment for reasoning.
*/ */
written = FileWrite(src->vfd, waldata_start, len, written = FileWrite(src->vfd, waldata_start, len, src->off,
WAIT_EVENT_LOGICAL_REWRITE_WRITE); WAIT_EVENT_LOGICAL_REWRITE_WRITE);
if (written != len) if (written != len)
ereport(ERROR, ereport(ERROR,
......
...@@ -2478,18 +2478,6 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) ...@@ -2478,18 +2478,6 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
Size nleft; Size nleft;
int written; int written;
/* Need to seek in the file? */
if (openLogOff != startoffset)
{
if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not seek in log file %s to offset %u: %m",
XLogFileNameP(ThisTimeLineID, openLogSegNo),
startoffset)));
openLogOff = startoffset;
}
/* OK to write the page(s) */ /* OK to write the page(s) */
from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
nbytes = npages * (Size) XLOG_BLCKSZ; nbytes = npages * (Size) XLOG_BLCKSZ;
...@@ -2498,7 +2486,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) ...@@ -2498,7 +2486,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{ {
errno = 0; errno = 0;
pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
written = write(openLogFile, from, nleft); written = pg_pwrite(openLogFile, from, nleft, startoffset);
pgstat_report_wait_end(); pgstat_report_wait_end();
if (written <= 0) if (written <= 0)
{ {
...@@ -2513,6 +2501,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) ...@@ -2513,6 +2501,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
} }
nleft -= written; nleft -= written;
from += written; from += written;
startoffset += written;
} while (nleft > 0); } while (nleft > 0);
/* Update state for write */ /* Update state for write */
...@@ -11821,22 +11810,9 @@ retry: ...@@ -11821,22 +11810,9 @@ retry:
/* Read the requested page */ /* Read the requested page */
readOff = targetPageOff; readOff = targetPageOff;
if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
{
char fname[MAXFNAMELEN];
int save_errno = errno;
XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
errno = save_errno;
ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
(errcode_for_file_access(),
errmsg("could not seek in log segment %s to offset %u: %m",
fname, readOff)));
goto next_record_is_invalid;
}
pgstat_report_wait_start(WAIT_EVENT_WAL_READ); pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
r = read(readFile, readBuf, XLOG_BLCKSZ); r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
if (r != XLOG_BLCKSZ) if (r != XLOG_BLCKSZ)
{ {
char fname[MAXFNAMELEN]; char fname[MAXFNAMELEN];
......
...@@ -67,12 +67,6 @@ struct BufFile ...@@ -67,12 +67,6 @@ struct BufFile
int numFiles; /* number of physical files in set */ int numFiles; /* number of physical files in set */
/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */ /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
File *files; /* palloc'd array with numFiles entries */ File *files; /* palloc'd array with numFiles entries */
off_t *offsets; /* palloc'd array with numFiles entries */
/*
* offsets[i] is the current seek position of files[i]. We use this to
* avoid making redundant FileSeek calls.
*/
bool isInterXact; /* keep open over transactions? */ bool isInterXact; /* keep open over transactions? */
bool dirty; /* does buffer need to be written? */ bool dirty; /* does buffer need to be written? */
...@@ -116,7 +110,6 @@ makeBufFileCommon(int nfiles) ...@@ -116,7 +110,6 @@ makeBufFileCommon(int nfiles)
BufFile *file = (BufFile *) palloc(sizeof(BufFile)); BufFile *file = (BufFile *) palloc(sizeof(BufFile));
file->numFiles = nfiles; file->numFiles = nfiles;
file->offsets = (off_t *) palloc0(sizeof(off_t) * nfiles);
file->isInterXact = false; file->isInterXact = false;
file->dirty = false; file->dirty = false;
file->resowner = CurrentResourceOwner; file->resowner = CurrentResourceOwner;
...@@ -170,10 +163,7 @@ extendBufFile(BufFile *file) ...@@ -170,10 +163,7 @@ extendBufFile(BufFile *file)
file->files = (File *) repalloc(file->files, file->files = (File *) repalloc(file->files,
(file->numFiles + 1) * sizeof(File)); (file->numFiles + 1) * sizeof(File));
file->offsets = (off_t *) repalloc(file->offsets,
(file->numFiles + 1) * sizeof(off_t));
file->files[file->numFiles] = pfile; file->files[file->numFiles] = pfile;
file->offsets[file->numFiles] = 0L;
file->numFiles++; file->numFiles++;
} }
...@@ -396,7 +386,6 @@ BufFileClose(BufFile *file) ...@@ -396,7 +386,6 @@ BufFileClose(BufFile *file)
FileClose(file->files[i]); FileClose(file->files[i]);
/* release the buffer space */ /* release the buffer space */
pfree(file->files); pfree(file->files);
pfree(file->offsets);
pfree(file); pfree(file);
} }
...@@ -422,27 +411,17 @@ BufFileLoadBuffer(BufFile *file) ...@@ -422,27 +411,17 @@ BufFileLoadBuffer(BufFile *file)
file->curOffset = 0L; file->curOffset = 0L;
} }
/*
* May need to reposition physical file.
*/
thisfile = file->files[file->curFile];
if (file->curOffset != file->offsets[file->curFile])
{
if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
return; /* seek failed, read nothing */
file->offsets[file->curFile] = file->curOffset;
}
/* /*
* Read whatever we can get, up to a full bufferload. * Read whatever we can get, up to a full bufferload.
*/ */
thisfile = file->files[file->curFile];
file->nbytes = FileRead(thisfile, file->nbytes = FileRead(thisfile,
file->buffer.data, file->buffer.data,
sizeof(file->buffer), sizeof(file->buffer),
file->curOffset,
WAIT_EVENT_BUFFILE_READ); WAIT_EVENT_BUFFILE_READ);
if (file->nbytes < 0) if (file->nbytes < 0)
file->nbytes = 0; file->nbytes = 0;
file->offsets[file->curFile] += file->nbytes;
/* we choose not to advance curOffset here */ /* we choose not to advance curOffset here */
if (file->nbytes > 0) if (file->nbytes > 0)
...@@ -491,23 +470,14 @@ BufFileDumpBuffer(BufFile *file) ...@@ -491,23 +470,14 @@ BufFileDumpBuffer(BufFile *file)
if ((off_t) bytestowrite > availbytes) if ((off_t) bytestowrite > availbytes)
bytestowrite = (int) availbytes; bytestowrite = (int) availbytes;
/*
* May need to reposition physical file.
*/
thisfile = file->files[file->curFile]; thisfile = file->files[file->curFile];
if (file->curOffset != file->offsets[file->curFile])
{
if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
return; /* seek failed, give up */
file->offsets[file->curFile] = file->curOffset;
}
bytestowrite = FileWrite(thisfile, bytestowrite = FileWrite(thisfile,
file->buffer.data + wpos, file->buffer.data + wpos,
bytestowrite, bytestowrite,
file->curOffset,
WAIT_EVENT_BUFFILE_WRITE); WAIT_EVENT_BUFFILE_WRITE);
if (bytestowrite <= 0) if (bytestowrite <= 0)
return; /* failed to write */ return; /* failed to write */
file->offsets[file->curFile] += bytestowrite;
file->curOffset += bytestowrite; file->curOffset += bytestowrite;
wpos += bytestowrite; wpos += bytestowrite;
...@@ -803,11 +773,10 @@ BufFileSize(BufFile *file) ...@@ -803,11 +773,10 @@ BufFileSize(BufFile *file)
{ {
off_t lastFileSize; off_t lastFileSize;
/* Get the size of the last physical file by seeking to end. */ /* Get the size of the last physical file. */
lastFileSize = FileSeek(file->files[file->numFiles - 1], 0, SEEK_END); lastFileSize = FileSize(file->files[file->numFiles - 1]);
if (lastFileSize < 0) if (lastFileSize < 0)
return -1; return -1;
file->offsets[file->numFiles - 1] = lastFileSize;
return ((file->numFiles - 1) * (off_t) MAX_PHYSICAL_FILESIZE) + return ((file->numFiles - 1) * (off_t) MAX_PHYSICAL_FILESIZE) +
lastFileSize; lastFileSize;
...@@ -849,13 +818,8 @@ BufFileAppend(BufFile *target, BufFile *source) ...@@ -849,13 +818,8 @@ BufFileAppend(BufFile *target, BufFile *source)
target->files = (File *) target->files = (File *)
repalloc(target->files, sizeof(File) * newNumFiles); repalloc(target->files, sizeof(File) * newNumFiles);
target->offsets = (off_t *)
repalloc(target->offsets, sizeof(off_t) * newNumFiles);
for (i = target->numFiles; i < newNumFiles; i++) for (i = target->numFiles; i < newNumFiles; i++)
{
target->files[i] = source->files[i - target->numFiles]; target->files[i] = source->files[i - target->numFiles];
target->offsets[i] = source->offsets[i - target->numFiles];
}
target->numFiles = newNumFiles; target->numFiles = newNumFiles;
return startBlock; return startBlock;
......
This diff is collapsed.
...@@ -522,22 +522,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ...@@ -522,22 +522,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
/* if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
* Note: because caller usually obtained blocknum by calling mdnblocks,
* which did a seek(SEEK_END), this seek is often redundant and will be
* optimized away by fd.c. It's not redundant, however, if there is a
* partial page at the end of the file. In that case we want to try to
* overwrite the partial page with a full page. It's also not redundant
* if bufmgr.c had to dump another buffer of the same file to make room
* for the new page's buffer.
*/
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not seek to block %u in file \"%s\": %m",
blocknum, FilePathName(v->mdfd_vfd))));
if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
{ {
if (nbytes < 0) if (nbytes < 0)
ereport(ERROR, ereport(ERROR,
...@@ -748,13 +733,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ...@@ -748,13 +733,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not seek to block %u in file \"%s\": %m",
blocknum, FilePathName(v->mdfd_vfd))));
nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);
TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.spcNode,
...@@ -824,13 +803,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ...@@ -824,13 +803,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not seek to block %u in file \"%s\": %m",
blocknum, FilePathName(v->mdfd_vfd))));
nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_WRITE);
TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.spcNode,
...@@ -1979,7 +1952,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) ...@@ -1979,7 +1952,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{ {
off_t len; off_t len;
len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END); len = FileSize(seg->mdfd_vfd);
if (len < 0) if (len < 0)
ereport(ERROR, ereport(ERROR,
(errcode_for_file_access(), (errcode_for_file_access(),
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
/* /*
* calls: * calls:
* *
* File {Close, Read, Write, Seek, Tell, Sync} * File {Close, Read, Write, Size, Sync}
* {Path Name Open, Allocate, Free} File * {Path Name Open, Allocate, Free} File
* *
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES. * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
...@@ -42,10 +42,6 @@ ...@@ -42,10 +42,6 @@
#include <dirent.h> #include <dirent.h>
/*
* FileSeek uses the standard UNIX lseek(2) flags.
*/
typedef int File; typedef int File;
...@@ -68,10 +64,10 @@ extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fil ...@@ -68,10 +64,10 @@ extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fil
extern File OpenTemporaryFile(bool interXact); extern File OpenTemporaryFile(bool interXact);
extern void FileClose(File file); extern void FileClose(File file);
extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info); extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info);
extern int FileRead(File file, char *buffer, int amount, uint32 wait_event_info); extern int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
extern int FileWrite(File file, char *buffer, int amount, uint32 wait_event_info); extern int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
extern int FileSync(File file, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info);
extern off_t FileSeek(File file, off_t offset, int whence); extern off_t FileSize(File file);
extern int FileTruncate(File file, off_t offset, uint32 wait_event_info); extern int FileTruncate(File file, off_t offset, uint32 wait_event_info);
extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info); extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info);
extern char *FilePathName(File file); extern char *FilePathName(File file);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment