Commit 3d475515 authored by Tom Lane

Account explicitly for long-lived FDs that are allocated outside fd.c.

The comments in fd.c have long claimed that all file allocations should
go through that module, but in reality that's not always practical.
fd.c doesn't supply APIs for invoking some FD-producing syscalls like
pipe() or epoll_create(); and the APIs it does supply for non-virtual
FDs are mostly insistent on releasing those FDs at transaction end;
and in some cases the actual open() call is in code that can't be made
to use fd.c, such as libpq.

This has led to a situation where, in a modern server, there are likely
to be seven or so long-lived FDs per backend process that are not known
to fd.c.  Since NUM_RESERVED_FDS is only 10, that meant we had *very*
few spare FDs if max_files_per_process is >= the system ulimit and
fd.c had opened all the files it thought it safely could.  The
contrib/postgres_fdw regression test, in particular, could easily be
made to fall over by running it under a restrictive ulimit.

To improve matters, invent functions Acquire/Reserve/ReleaseExternalFD
that allow outside callers to tell fd.c that they have or want to allocate
a FD that's not directly managed by fd.c.  Add calls to track all the
fixed FDs in a standard backend session, so that we are honestly
guaranteeing that NUM_RESERVED_FDS FDs remain unused below the EMFILE
limit in a backend's idle state.  The coding rules for these functions say
that there's no need to call them in code that just allocates one FD over
a fairly short interval; we can dip into NUM_RESERVED_FDS for such cases.
That means that there aren't all that many places where we need to worry.
But postgres_fdw and dblink must use this facility to account for
long-lived FDs consumed by libpq connections.  There may be other places
where it's worth doing such accounting, too, but this seems like enough
to solve the immediate problem.

Internally to fd.c, "external" FDs are limited to max_safe_fds/3 FDs.
(Callers can choose to ignore this limit, but of course it's unwise
to do so except for fixed file allocations.)  I also reduced the limit
on "allocated" files to max_safe_fds/3 FDs (it had been max_safe_fds/2).
Conceivably a smarter rule could be used here --- but in practice,
on reasonable systems, max_safe_fds should be large enough that this
isn't much of an issue, so KISS for now.  To avoid possible regression
in the number of external or allocated files that can be opened,
increase FD_MINFREE and the lower limit on max_files_per_process a
little bit; we now insist that the effective "ulimit -n" be at least 64.

This seems like pretty clearly a bug fix, but in view of the lack of
field complaints, I'll refrain from risking a back-patch.

Discussion: https://postgr.es/m/E1izCmM-0005pV-Co@gemulon.postgresql.org
parent 1420617b
...@@ -200,12 +200,33 @@ dblink_get_conn(char *conname_or_str, ...@@ -200,12 +200,33 @@ dblink_get_conn(char *conname_or_str,
if (connstr == NULL) if (connstr == NULL)
connstr = conname_or_str; connstr = conname_or_str;
dblink_connstr_check(connstr); dblink_connstr_check(connstr);
/*
* We must obey fd.c's limit on non-virtual file descriptors. Assume
* that a PGconn represents one long-lived FD. (Doing this here also
* ensures that VFDs are closed if needed to make room.)
*/
if (!AcquireExternalFD())
ereport(ERROR,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg("could not establish connection"),
errdetail("There are too many open files on the local server."),
#ifndef WIN32
errhint("Raise the server's max_files_per_process and/or \"ulimit -n\" limits.")
#else
errhint("Raise the server's max_files_per_process setting.")
#endif
));
/* OK to make connection */
conn = PQconnectdb(connstr); conn = PQconnectdb(connstr);
if (PQstatus(conn) == CONNECTION_BAD) if (PQstatus(conn) == CONNECTION_BAD)
{ {
char *msg = pchomp(PQerrorMessage(conn)); char *msg = pchomp(PQerrorMessage(conn));
PQfinish(conn); PQfinish(conn);
ReleaseExternalFD();
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg("could not establish connection"), errmsg("could not establish connection"),
...@@ -282,12 +303,32 @@ dblink_connect(PG_FUNCTION_ARGS) ...@@ -282,12 +303,32 @@ dblink_connect(PG_FUNCTION_ARGS)
/* check password in connection string if not superuser */ /* check password in connection string if not superuser */
dblink_connstr_check(connstr); dblink_connstr_check(connstr);
/*
* We must obey fd.c's limit on non-virtual file descriptors. Assume that
* a PGconn represents one long-lived FD. (Doing this here also ensures
* that VFDs are closed if needed to make room.)
*/
if (!AcquireExternalFD())
ereport(ERROR,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg("could not establish connection"),
errdetail("There are too many open files on the local server."),
#ifndef WIN32
errhint("Raise the server's max_files_per_process and/or \"ulimit -n\" limits.")
#else
errhint("Raise the server's max_files_per_process setting.")
#endif
));
/* OK to make connection */
conn = PQconnectdb(connstr); conn = PQconnectdb(connstr);
if (PQstatus(conn) == CONNECTION_BAD) if (PQstatus(conn) == CONNECTION_BAD)
{ {
msg = pchomp(PQerrorMessage(conn)); msg = pchomp(PQerrorMessage(conn));
PQfinish(conn); PQfinish(conn);
ReleaseExternalFD();
if (rconn) if (rconn)
pfree(rconn); pfree(rconn);
...@@ -312,7 +353,10 @@ dblink_connect(PG_FUNCTION_ARGS) ...@@ -312,7 +353,10 @@ dblink_connect(PG_FUNCTION_ARGS)
else else
{ {
if (pconn->conn) if (pconn->conn)
{
PQfinish(pconn->conn); PQfinish(pconn->conn);
ReleaseExternalFD();
}
pconn->conn = conn; pconn->conn = conn;
} }
...@@ -346,6 +390,7 @@ dblink_disconnect(PG_FUNCTION_ARGS) ...@@ -346,6 +390,7 @@ dblink_disconnect(PG_FUNCTION_ARGS)
dblink_conn_not_avail(conname); dblink_conn_not_avail(conname);
PQfinish(conn); PQfinish(conn);
ReleaseExternalFD();
if (rconn) if (rconn)
{ {
deleteConnection(conname); deleteConnection(conname);
...@@ -780,7 +825,10 @@ dblink_record_internal(FunctionCallInfo fcinfo, bool is_async) ...@@ -780,7 +825,10 @@ dblink_record_internal(FunctionCallInfo fcinfo, bool is_async)
{ {
/* if needed, close the connection to the database */ /* if needed, close the connection to the database */
if (freeconn) if (freeconn)
{
PQfinish(conn); PQfinish(conn);
ReleaseExternalFD();
}
} }
PG_END_TRY(); PG_END_TRY();
...@@ -1458,7 +1506,10 @@ dblink_exec(PG_FUNCTION_ARGS) ...@@ -1458,7 +1506,10 @@ dblink_exec(PG_FUNCTION_ARGS)
{ {
/* if needed, close the connection to the database */ /* if needed, close the connection to the database */
if (freeconn) if (freeconn)
{
PQfinish(conn); PQfinish(conn);
ReleaseExternalFD();
}
} }
PG_END_TRY(); PG_END_TRY();
...@@ -2563,6 +2614,7 @@ createNewConnection(const char *name, remoteConn *rconn) ...@@ -2563,6 +2614,7 @@ createNewConnection(const char *name, remoteConn *rconn)
if (found) if (found)
{ {
PQfinish(rconn->conn); PQfinish(rconn->conn);
ReleaseExternalFD();
pfree(rconn); pfree(rconn);
ereport(ERROR, ereport(ERROR,
...@@ -2604,6 +2656,7 @@ dblink_security_check(PGconn *conn, remoteConn *rconn) ...@@ -2604,6 +2656,7 @@ dblink_security_check(PGconn *conn, remoteConn *rconn)
if (!PQconnectionUsedPassword(conn)) if (!PQconnectionUsedPassword(conn))
{ {
PQfinish(conn); PQfinish(conn);
ReleaseExternalFD();
if (rconn) if (rconn)
pfree(rconn); pfree(rconn);
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "miscadmin.h" #include "miscadmin.h"
#include "pgstat.h" #include "pgstat.h"
#include "postgres_fdw.h" #include "postgres_fdw.h"
#include "storage/fd.h"
#include "storage/latch.h" #include "storage/latch.h"
#include "utils/hsearch.h" #include "utils/hsearch.h"
#include "utils/inval.h" #include "utils/inval.h"
...@@ -259,10 +260,33 @@ connect_pg_server(ForeignServer *server, UserMapping *user) ...@@ -259,10 +260,33 @@ connect_pg_server(ForeignServer *server, UserMapping *user)
keywords[n] = values[n] = NULL; keywords[n] = values[n] = NULL;
/* verify connection parameters and make connection */ /* verify the set of connection parameters */
check_conn_params(keywords, values, user); check_conn_params(keywords, values, user);
/*
* We must obey fd.c's limit on non-virtual file descriptors. Assume
* that a PGconn represents one long-lived FD. (Doing this here also
* ensures that VFDs are closed if needed to make room.)
*/
if (!AcquireExternalFD())
ereport(ERROR,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg("could not connect to server \"%s\"",
server->servername),
errdetail("There are too many open files on the local server."),
#ifndef WIN32
errhint("Raise the server's max_files_per_process and/or \"ulimit -n\" limits.")
#else
errhint("Raise the server's max_files_per_process setting.")
#endif
));
/* OK to make connection */
conn = PQconnectdbParams(keywords, values, false); conn = PQconnectdbParams(keywords, values, false);
if (!conn)
ReleaseExternalFD(); /* because the PG_CATCH block won't */
if (!conn || PQstatus(conn) != CONNECTION_OK) if (!conn || PQstatus(conn) != CONNECTION_OK)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
...@@ -294,7 +318,10 @@ connect_pg_server(ForeignServer *server, UserMapping *user) ...@@ -294,7 +318,10 @@ connect_pg_server(ForeignServer *server, UserMapping *user)
{ {
/* Release PGconn data structure if we managed to create one */ /* Release PGconn data structure if we managed to create one */
if (conn) if (conn)
{
PQfinish(conn); PQfinish(conn);
ReleaseExternalFD();
}
PG_RE_THROW(); PG_RE_THROW();
} }
PG_END_TRY(); PG_END_TRY();
...@@ -312,6 +339,7 @@ disconnect_pg_server(ConnCacheEntry *entry) ...@@ -312,6 +339,7 @@ disconnect_pg_server(ConnCacheEntry *entry)
{ {
PQfinish(entry->conn); PQfinish(entry->conn);
entry->conn = NULL; entry->conn = NULL;
ReleaseExternalFD();
} }
} }
......
...@@ -774,6 +774,7 @@ static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "strea ...@@ -774,6 +774,7 @@ static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "strea
* openLogFile is -1 or a kernel FD for an open log file segment. * openLogFile is -1 or a kernel FD for an open log file segment.
* openLogSegNo identifies the segment. These variables are only used to * openLogSegNo identifies the segment. These variables are only used to
* write the XLOG, and so will normally refer to the active segment. * write the XLOG, and so will normally refer to the active segment.
* Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
*/ */
static int openLogFile = -1; static int openLogFile = -1;
static XLogSegNo openLogSegNo = 0; static XLogSegNo openLogSegNo = 0;
...@@ -785,6 +786,9 @@ static XLogSegNo openLogSegNo = 0; ...@@ -785,6 +786,9 @@ static XLogSegNo openLogSegNo = 0;
* will be just past that page. readLen indicates how much of the current * will be just past that page. readLen indicates how much of the current
* page has been read into readBuf, and readSource indicates where we got * page has been read into readBuf, and readSource indicates where we got
* the currently open file from. * the currently open file from.
* Note: we could use Reserve/ReleaseExternalFD to track consumption of
* this FD too; but it doesn't currently seem worthwhile, since the XLOG is
* not read by general-purpose sessions.
*/ */
static int readFile = -1; static int readFile = -1;
static XLogSegNo readSegNo = 0; static XLogSegNo readSegNo = 0;
...@@ -2447,6 +2451,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) ...@@ -2447,6 +2451,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
/* create/use new log file */ /* create/use new log file */
use_existent = true; use_existent = true;
openLogFile = XLogFileInit(openLogSegNo, &use_existent, true); openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
ReserveExternalFD();
} }
/* Make sure we have the current logfile open */ /* Make sure we have the current logfile open */
...@@ -2455,6 +2460,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) ...@@ -2455,6 +2460,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size); wal_segment_size);
openLogFile = XLogFileOpen(openLogSegNo); openLogFile = XLogFileOpen(openLogSegNo);
ReserveExternalFD();
} }
/* Add current page to the set of pending pages-to-dump */ /* Add current page to the set of pending pages-to-dump */
...@@ -2605,6 +2611,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) ...@@ -2605,6 +2611,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size); wal_segment_size);
openLogFile = XLogFileOpen(openLogSegNo); openLogFile = XLogFileOpen(openLogSegNo);
ReserveExternalFD();
} }
issue_xlog_fsync(openLogFile, openLogSegNo); issue_xlog_fsync(openLogFile, openLogSegNo);
...@@ -3811,6 +3818,7 @@ XLogFileClose(void) ...@@ -3811,6 +3818,7 @@ XLogFileClose(void)
} }
openLogFile = -1; openLogFile = -1;
ReleaseExternalFD();
} }
/* /*
...@@ -5224,6 +5232,11 @@ BootStrapXLOG(void) ...@@ -5224,6 +5232,11 @@ BootStrapXLOG(void)
use_existent = false; use_existent = false;
openLogFile = XLogFileInit(1, &use_existent, false); openLogFile = XLogFileInit(1, &use_existent, false);
/*
* We needn't bother with Reserve/ReleaseExternalFD here, since we'll
* close the file again in a moment.
*/
/* Write the first page with the initial record */ /* Write the first page with the initial record */
errno = 0; errno = 0;
pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE); pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
......
...@@ -602,6 +602,9 @@ retry2: ...@@ -602,6 +602,9 @@ retry2:
pg_freeaddrinfo_all(hints.ai_family, addrs); pg_freeaddrinfo_all(hints.ai_family, addrs);
/* Now that we have a long-lived socket, tell fd.c about it. */
ReserveExternalFD();
return; return;
startup_failed: startup_failed:
......
...@@ -2554,9 +2554,14 @@ ClosePostmasterPorts(bool am_syslogger) ...@@ -2554,9 +2554,14 @@ ClosePostmasterPorts(bool am_syslogger)
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg_internal("could not close postmaster death monitoring pipe in child process: %m"))); errmsg_internal("could not close postmaster death monitoring pipe in child process: %m")));
postmaster_alive_fds[POSTMASTER_FD_OWN] = -1; postmaster_alive_fds[POSTMASTER_FD_OWN] = -1;
/* Notify fd.c that we released one pipe FD. */
ReleaseExternalFD();
#endif #endif
/* Close the listen sockets */ /*
* Close the postmaster's listen sockets. These aren't tracked by fd.c,
* so we don't call ReleaseExternalFD() here.
*/
for (i = 0; i < MAXLISTEN; i++) for (i = 0; i < MAXLISTEN; i++)
{ {
if (ListenSocket[i] != PGINVALID_SOCKET) if (ListenSocket[i] != PGINVALID_SOCKET)
...@@ -2566,7 +2571,10 @@ ClosePostmasterPorts(bool am_syslogger) ...@@ -2566,7 +2571,10 @@ ClosePostmasterPorts(bool am_syslogger)
} }
} }
/* If using syslogger, close the read side of the pipe */ /*
* If using syslogger, close the read side of the pipe. We don't bother
* tracking this in fd.c, either.
*/
if (!am_syslogger) if (!am_syslogger)
{ {
#ifndef WIN32 #ifndef WIN32
...@@ -4279,6 +4287,9 @@ BackendInitialize(Port *port) ...@@ -4279,6 +4287,9 @@ BackendInitialize(Port *port)
/* Save port etc. for ps status */ /* Save port etc. for ps status */
MyProcPort = port; MyProcPort = port;
/* Tell fd.c about the long-lived FD associated with the port */
ReserveExternalFD();
/* /*
* PreAuthDelay is a debugging aid for investigating problems in the * PreAuthDelay is a debugging aid for investigating problems in the
* authentication cycle: it can be set in postgresql.conf to allow time to * authentication cycle: it can be set in postgresql.conf to allow time to
...@@ -6442,6 +6453,20 @@ restore_backend_variables(BackendParameters *param, Port *port) ...@@ -6442,6 +6453,20 @@ restore_backend_variables(BackendParameters *param, Port *port)
strlcpy(pkglib_path, param->pkglib_path, MAXPGPATH); strlcpy(pkglib_path, param->pkglib_path, MAXPGPATH);
strlcpy(ExtraOptions, param->ExtraOptions, MAXPGPATH); strlcpy(ExtraOptions, param->ExtraOptions, MAXPGPATH);
/*
* We need to restore fd.c's counts of externally-opened FDs; to avoid
* confusion, be sure to do this after restoring max_safe_fds. (Note:
* BackendInitialize will handle this for port->sock.)
*/
#ifndef WIN32
if (postmaster_alive_fds[0] >= 0)
ReserveExternalFD();
if (postmaster_alive_fds[1] >= 0)
ReserveExternalFD();
#endif
if (pgStatSock != PGINVALID_SOCKET)
ReserveExternalFD();
} }
...@@ -6584,6 +6609,10 @@ InitPostmasterDeathWatchHandle(void) ...@@ -6584,6 +6609,10 @@ InitPostmasterDeathWatchHandle(void)
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg_internal("could not create pipe to monitor postmaster death: %m"))); errmsg_internal("could not create pipe to monitor postmaster death: %m")));
/* Notify fd.c that we've eaten two FDs for the pipe. */
ReserveExternalFD();
ReserveExternalFD();
/* /*
* Set O_NONBLOCK to allow testing for the fd's presence with a read() * Set O_NONBLOCK to allow testing for the fd's presence with a read()
* call. * call.
......
...@@ -562,6 +562,11 @@ SysLogger_Start(void) ...@@ -562,6 +562,11 @@ SysLogger_Start(void)
* This means the postmaster must continue to hold the read end of the * This means the postmaster must continue to hold the read end of the
* pipe open, so we can pass it down to the reincarnated syslogger. This * pipe open, so we can pass it down to the reincarnated syslogger. This
* is a bit klugy but we have little choice. * is a bit klugy but we have little choice.
*
* Also note that we don't bother counting the pipe FDs by calling
* Reserve/ReleaseExternalFD. There's no real need to account for them
* accurately in the postmaster or syslogger process, and both ends of the
* pipe will wind up closed in all other postmaster children.
*/ */
#ifndef WIN32 #ifndef WIN32
if (syslogPipe[0] < 0) if (syslogPipe[0] < 0)
......
...@@ -61,6 +61,12 @@ ...@@ -61,6 +61,12 @@
* BasicOpenFile, it is solely the caller's responsibility to close the file * BasicOpenFile, it is solely the caller's responsibility to close the file
* descriptor by calling close(2). * descriptor by calling close(2).
* *
* If a non-virtual file descriptor needs to be held open for any length of
* time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
* (and eventually ReleaseExternalFD), so that we can take it into account
* while deciding how many VFDs can be open. This applies to FDs obtained
* with BasicOpenFile as well as those obtained without use of any fd.c API.
*
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -103,8 +109,8 @@ ...@@ -103,8 +109,8 @@
/* /*
* We must leave some file descriptors free for system(), the dynamic loader, * We must leave some file descriptors free for system(), the dynamic loader,
* and other code that tries to open files without consulting fd.c. This * and other code that tries to open files without consulting fd.c. This
* is the number left free. (While we can be pretty sure we won't get * is the number left free. (While we try fairly hard to prevent EMFILE
* EMFILE, there's never any guarantee that we won't get ENFILE due to * errors, there's never any guarantee that we won't get ENFILE due to
* other processes chewing up FDs. So it's a bad idea to try to open files * other processes chewing up FDs. So it's a bad idea to try to open files
* without consulting fd.c. Nonetheless we cannot control all code.) * without consulting fd.c. Nonetheless we cannot control all code.)
* *
...@@ -119,9 +125,12 @@ ...@@ -119,9 +125,12 @@
/* /*
* If we have fewer than this many usable FDs after allowing for the reserved * If we have fewer than this many usable FDs after allowing for the reserved
* ones, choke. * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
* much less than that. Note that this value ensures numExternalFDs can be
* at least 16; as of this writing, the contrib/postgres_fdw regression tests
* will not pass unless that can grow to at least 14.)
*/ */
#define FD_MINFREE 10 #define FD_MINFREE 48
/* /*
* A number of platforms allow individual processes to open many more files * A number of platforms allow individual processes to open many more files
...@@ -132,8 +141,8 @@ ...@@ -132,8 +141,8 @@
int max_files_per_process = 1000; int max_files_per_process = 1000;
/* /*
* Maximum number of file descriptors to open for either VFD entries or * Maximum number of file descriptors to open for operations that fd.c knows
* AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
* to a conservative value, and remains that way indefinitely in bootstrap or * to a conservative value, and remains that way indefinitely in bootstrap or
* standalone-backend cases. In normal postmaster operation, the postmaster * standalone-backend cases. In normal postmaster operation, the postmaster
* calls set_max_safe_fds() late in initialization to update the value, and * calls set_max_safe_fds() late in initialization to update the value, and
...@@ -142,7 +151,7 @@ int max_files_per_process = 1000; ...@@ -142,7 +151,7 @@ int max_files_per_process = 1000;
* Note: the value of max_files_per_process is taken into account while * Note: the value of max_files_per_process is taken into account while
* setting this variable, and so need not be tested separately. * setting this variable, and so need not be tested separately.
*/ */
int max_safe_fds = 32; /* default if not changed */ int max_safe_fds = FD_MINFREE; /* default if not changed */
/* Whether it is safe to continue running after fsync() fails. */ /* Whether it is safe to continue running after fsync() fails. */
bool data_sync_retry = false; bool data_sync_retry = false;
...@@ -243,6 +252,11 @@ static int numAllocatedDescs = 0; ...@@ -243,6 +252,11 @@ static int numAllocatedDescs = 0;
static int maxAllocatedDescs = 0; static int maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL; static AllocateDesc *allocatedDescs = NULL;
/*
* Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
*/
static int numExternalFDs = 0;
/* /*
* Number of temporary files opened during the current session; * Number of temporary files opened during the current session;
* this is used in generation of tempfile names. * this is used in generation of tempfile names.
...@@ -1025,6 +1039,80 @@ tryAgain: ...@@ -1025,6 +1039,80 @@ tryAgain:
return -1; /* failure */ return -1; /* failure */
} }
/*
 * AcquireExternalFD - try to claim a slot for one "external" file descriptor
 *
 * Intended for callers that will keep a file descriptor open for longer than
 * a brief interval and cannot obtain it through the other facilities of this
 * module.
 *
 * Unlike ReserveExternalFD, which never reports failure, this function
 * refuses the request once the count of reserved external FDs has reached
 * max_safe_fds / 3.  On refusal nothing is reserved, errno is set to EMFILE,
 * and false is returned.  On success the FD is accounted for by calling
 * ReserveExternalFD, and true is returned.  Prefer this entry point wherever
 * the number of FDs to be reserved is not known to be small and predictable.
 */
bool
AcquireExternalFD(void)
{
	/* "External" FDs may use at most one third of max_safe_fds. */
	int			external_fd_cap = max_safe_fds / 3;

	if (numExternalFDs >= external_fd_cap)
	{
		/* Report the refusal the same way a failed open() would. */
		errno = EMFILE;
		return false;
	}

	ReserveExternalFD();
	return true;
}
/*
 * ReserveExternalFD - record that one file descriptor is held outside fd.c
 *
 * For callers that keep a file descriptor open for more than a brief interval
 * and cannot obtain it through the other facilities of this module.  The
 * function only does bookkeeping: it asks ReleaseLruFiles to close VFDs if
 * that is needed to stay within the FD budget, then bumps the external-FD
 * counter.  It never reports failure.
 *
 * Call this directly only where being unable to reserve the FD would be
 * fatal anyway (the WAL-writing code is one such caller, since its
 * alternative is session failure).  It is unwise to use it in code that may
 * consume more than one FD per process; such code should go through
 * AcquireExternalFD instead.
 *
 * Note: provided all callers cooperate so that NUM_RESERVED_FDS FDs stay
 * free, it matters little whether this is called before or after the FD is
 * actually opened.  Calling it first, however, lowers the chance of an EMFILE
 * failure when some caller has not cooperated.  Either way, keeping the
 * external-FD count in step with reality is entirely the caller's job.
 */
void
ReserveExternalFD(void)
{
	/*
	 * Make room first, count second.  Trimming VFDs before the increment is
	 * what lets the end state satisfy
	 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
	 */
	ReleaseLruFiles();

	numExternalFDs += 1;
}
/*
 * ReleaseExternalFD - record that one external file descriptor was closed
 *
 * Undoes the accounting of one earlier ReserveExternalFD (or successful
 * AcquireExternalFD) call.  It only decrements a counter and so leaves errno
 * untouched, which makes it safe to call on error-cleanup paths before the
 * saved errno is reported.
 */
void
ReleaseExternalFD(void)
{
	/* A release without a matching reservation is a caller bug. */
	Assert(numExternalFDs > 0);

	numExternalFDs -= 1;
}
#if defined(FDDEBUG) #if defined(FDDEBUG)
static void static void
...@@ -1185,7 +1273,7 @@ ReleaseLruFile(void) ...@@ -1185,7 +1273,7 @@ ReleaseLruFile(void)
static void static void
ReleaseLruFiles(void) ReleaseLruFiles(void)
{ {
while (nfile + numAllocatedDescs >= max_safe_fds) while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
{ {
if (!ReleaseLruFile()) if (!ReleaseLruFile())
break; break;
...@@ -2176,13 +2264,13 @@ reserveAllocatedDesc(void) ...@@ -2176,13 +2264,13 @@ reserveAllocatedDesc(void)
/* /*
* If the array hasn't yet been created in the current process, initialize * If the array hasn't yet been created in the current process, initialize
* it with FD_MINFREE / 2 elements. In many scenarios this is as many as * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
* we will ever need, anyway. We don't want to look at max_safe_fds * we will ever need, anyway. We don't want to look at max_safe_fds
* immediately because set_max_safe_fds() may not have run yet. * immediately because set_max_safe_fds() may not have run yet.
*/ */
if (allocatedDescs == NULL) if (allocatedDescs == NULL)
{ {
newMax = FD_MINFREE / 2; newMax = FD_MINFREE / 3;
newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc)); newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
/* Out of memory already? Treat as fatal error. */ /* Out of memory already? Treat as fatal error. */
if (newDescs == NULL) if (newDescs == NULL)
...@@ -2200,10 +2288,12 @@ reserveAllocatedDesc(void) ...@@ -2200,10 +2288,12 @@ reserveAllocatedDesc(void)
* *
* We mustn't let allocated descriptors hog all the available FDs, and in * We mustn't let allocated descriptors hog all the available FDs, and in
* practice we'd better leave a reasonable number of FDs for VFD use. So * practice we'd better leave a reasonable number of FDs for VFD use. So
* set the maximum to max_safe_fds / 2. (This should certainly be at * set the maximum to max_safe_fds / 3. (This should certainly be at
* least as large as the initial size, FD_MINFREE / 2.) * least as large as the initial size, FD_MINFREE / 3, so we aren't
* tightening the restriction here.) Recall that "external" FDs are
* allowed to consume another third of max_safe_fds.
*/ */
newMax = max_safe_fds / 2; newMax = max_safe_fds / 3;
if (newMax > maxAllocatedDescs) if (newMax > maxAllocatedDescs)
{ {
newDescs = (AllocateDesc *) realloc(allocatedDescs, newDescs = (AllocateDesc *) realloc(allocatedDescs,
......
...@@ -247,14 +247,17 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, ...@@ -247,14 +247,17 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
/* /*
* Create new segment or open an existing one for attach. * Create new segment or open an existing one for attach.
* *
* Even though we're not going through fd.c, we should be safe against * Even though we will close the FD before returning, it seems desirable
* running out of file descriptors, because of NUM_RESERVED_FDS. We're * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
* only opening one extra descriptor here, and we'll close it before * failure. The fact that we won't hold the FD open long justifies using
* returning. * ReserveExternalFD rather than AcquireExternalFD, though.
*/ */
ReserveExternalFD();
flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1) if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
{ {
ReleaseExternalFD();
if (errno != EEXIST) if (errno != EEXIST)
ereport(elevel, ereport(elevel,
(errcode_for_dynamic_shared_memory(), (errcode_for_dynamic_shared_memory(),
...@@ -278,6 +281,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, ...@@ -278,6 +281,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
/* Back out what's already been done. */ /* Back out what's already been done. */
save_errno = errno; save_errno = errno;
close(fd); close(fd);
ReleaseExternalFD();
errno = save_errno; errno = save_errno;
ereport(elevel, ereport(elevel,
...@@ -295,6 +299,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, ...@@ -295,6 +299,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
/* Back out what's already been done. */ /* Back out what's already been done. */
save_errno = errno; save_errno = errno;
close(fd); close(fd);
ReleaseExternalFD();
shm_unlink(name); shm_unlink(name);
errno = save_errno; errno = save_errno;
...@@ -323,6 +328,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, ...@@ -323,6 +328,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
/* Back out what's already been done. */ /* Back out what's already been done. */
save_errno = errno; save_errno = errno;
close(fd); close(fd);
ReleaseExternalFD();
if (op == DSM_OP_CREATE) if (op == DSM_OP_CREATE)
shm_unlink(name); shm_unlink(name);
errno = save_errno; errno = save_errno;
...@@ -336,6 +342,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, ...@@ -336,6 +342,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
*mapped_address = address; *mapped_address = address;
*mapped_size = request_size; *mapped_size = request_size;
close(fd); close(fd);
ReleaseExternalFD();
return true; return true;
} }
......
...@@ -51,6 +51,7 @@ ...@@ -51,6 +51,7 @@
#include "port/atomics.h" #include "port/atomics.h"
#include "portability/instr_time.h" #include "portability/instr_time.h"
#include "postmaster/postmaster.h" #include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "storage/ipc.h" #include "storage/ipc.h"
#include "storage/latch.h" #include "storage/latch.h"
#include "storage/pmsignal.h" #include "storage/pmsignal.h"
...@@ -187,6 +188,9 @@ InitializeLatchSupport(void) ...@@ -187,6 +188,9 @@ InitializeLatchSupport(void)
/* Clean up, just for safety's sake; we'll set these below */ /* Clean up, just for safety's sake; we'll set these below */
selfpipe_readfd = selfpipe_writefd = -1; selfpipe_readfd = selfpipe_writefd = -1;
selfpipe_owner_pid = 0; selfpipe_owner_pid = 0;
/* Keep fd.c's accounting straight */
ReleaseExternalFD();
ReleaseExternalFD();
} }
else else
{ {
...@@ -194,6 +198,7 @@ InitializeLatchSupport(void) ...@@ -194,6 +198,7 @@ InitializeLatchSupport(void)
* Postmaster didn't create a self-pipe ... or else we're in an * Postmaster didn't create a self-pipe ... or else we're in an
* EXEC_BACKEND build, in which case it doesn't matter since the * EXEC_BACKEND build, in which case it doesn't matter since the
* postmaster's pipe FDs were closed by the action of FD_CLOEXEC. * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
* fd.c won't have state to clean up, either.
*/ */
Assert(selfpipe_readfd == -1); Assert(selfpipe_readfd == -1);
} }
...@@ -228,6 +233,10 @@ InitializeLatchSupport(void) ...@@ -228,6 +233,10 @@ InitializeLatchSupport(void)
selfpipe_readfd = pipefd[0]; selfpipe_readfd = pipefd[0];
selfpipe_writefd = pipefd[1]; selfpipe_writefd = pipefd[1];
selfpipe_owner_pid = MyProcPid; selfpipe_owner_pid = MyProcPid;
/* Tell fd.c about these two long-lived FDs */
ReserveExternalFD();
ReserveExternalFD();
#else #else
/* currently, nothing to do here for Windows */ /* currently, nothing to do here for Windows */
#endif #endif
...@@ -604,24 +613,57 @@ CreateWaitEventSet(MemoryContext context, int nevents) ...@@ -604,24 +613,57 @@ CreateWaitEventSet(MemoryContext context, int nevents)
set->exit_on_postmaster_death = false; set->exit_on_postmaster_death = false;
#if defined(WAIT_USE_EPOLL) #if defined(WAIT_USE_EPOLL)
if (!AcquireExternalFD())
{
/* treat this as though epoll_create1 itself returned EMFILE */
elog(ERROR, "epoll_create1 failed: %m");
}
#ifdef EPOLL_CLOEXEC #ifdef EPOLL_CLOEXEC
set->epoll_fd = epoll_create1(EPOLL_CLOEXEC); set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
if (set->epoll_fd < 0) if (set->epoll_fd < 0)
{
ReleaseExternalFD();
elog(ERROR, "epoll_create1 failed: %m"); elog(ERROR, "epoll_create1 failed: %m");
}
#else #else
/* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */ /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
set->epoll_fd = epoll_create(nevents); set->epoll_fd = epoll_create(nevents);
if (set->epoll_fd < 0) if (set->epoll_fd < 0)
{
ReleaseExternalFD();
elog(ERROR, "epoll_create failed: %m"); elog(ERROR, "epoll_create failed: %m");
}
if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1) if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
{
int save_errno = errno;
close(set->epoll_fd);
ReleaseExternalFD();
errno = save_errno;
elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m"); elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
}
#endif /* EPOLL_CLOEXEC */ #endif /* EPOLL_CLOEXEC */
#elif defined(WAIT_USE_KQUEUE) #elif defined(WAIT_USE_KQUEUE)
if (!AcquireExternalFD())
{
/* treat this as though kqueue itself returned EMFILE */
elog(ERROR, "kqueue failed: %m");
}
set->kqueue_fd = kqueue(); set->kqueue_fd = kqueue();
if (set->kqueue_fd < 0) if (set->kqueue_fd < 0)
{
ReleaseExternalFD();
elog(ERROR, "kqueue failed: %m"); elog(ERROR, "kqueue failed: %m");
}
if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1) if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
{
int save_errno = errno;
close(set->kqueue_fd);
ReleaseExternalFD();
errno = save_errno;
elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m"); elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
}
set->report_postmaster_not_running = false; set->report_postmaster_not_running = false;
#elif defined(WAIT_USE_WIN32) #elif defined(WAIT_USE_WIN32)
...@@ -655,8 +697,10 @@ FreeWaitEventSet(WaitEventSet *set) ...@@ -655,8 +697,10 @@ FreeWaitEventSet(WaitEventSet *set)
{ {
#if defined(WAIT_USE_EPOLL) #if defined(WAIT_USE_EPOLL)
close(set->epoll_fd); close(set->epoll_fd);
ReleaseExternalFD();
#elif defined(WAIT_USE_KQUEUE) #elif defined(WAIT_USE_KQUEUE)
close(set->kqueue_fd); close(set->kqueue_fd);
ReleaseExternalFD();
#elif defined(WAIT_USE_WIN32) #elif defined(WAIT_USE_WIN32)
WaitEvent *cur_event; WaitEvent *cur_event;
......
...@@ -2431,7 +2431,7 @@ static struct config_int ConfigureNamesInt[] = ...@@ -2431,7 +2431,7 @@ static struct config_int ConfigureNamesInt[] =
NULL NULL
}, },
&max_files_per_process, &max_files_per_process,
1000, 25, INT_MAX, 1000, 64, INT_MAX,
NULL, NULL, NULL NULL, NULL, NULL
}, },
......
...@@ -153,7 +153,7 @@ ...@@ -153,7 +153,7 @@
# - Kernel Resources - # - Kernel Resources -
#max_files_per_process = 1000 # min 25 #max_files_per_process = 1000 # min 64
# (change requires restart) # (change requires restart)
# - Cost-Based Vacuum Delay - # - Cost-Based Vacuum Delay -
......
...@@ -35,6 +35,10 @@ ...@@ -35,6 +35,10 @@
* Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate * Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate
* open directories (DIR*), and OpenTransientFile/CloseTransientFile for an * open directories (DIR*), and OpenTransientFile/CloseTransientFile for an
* unbuffered file descriptor. * unbuffered file descriptor.
*
* If you really can't use any of the above, at least call AcquireExternalFD
* or ReserveExternalFD to report any file descriptors that are held for any
* length of time. Failure to do so risks unnecessary EMFILE errors.
*/ */
#ifndef FD_H #ifndef FD_H
#define FD_H #define FD_H
...@@ -120,7 +124,12 @@ extern int CloseTransientFile(int fd); ...@@ -120,7 +124,12 @@ extern int CloseTransientFile(int fd);
extern int BasicOpenFile(const char *fileName, int fileFlags); extern int BasicOpenFile(const char *fileName, int fileFlags);
extern int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); extern int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode);
/* Make a directory with default permissions */ /* Use these for other cases, and also for long-lived BasicOpenFile FDs */
extern bool AcquireExternalFD(void);
extern void ReserveExternalFD(void);
extern void ReleaseExternalFD(void);
/* Make a directory with default permissions */
extern int MakePGDirectory(const char *directoryName); extern int MakePGDirectory(const char *directoryName);
/* Miscellaneous support routines */ /* Miscellaneous support routines */
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment