postgresql/src/backend/storage/file/fd.c
Peter Eisentraut 2eb4a831e5 Change TRUE/FALSE to true/false
The lower case spellings are C and C++ standard and are used in most
parts of the PostgreSQL sources.  The upper case spellings are only used
in some files/modules.  So standardize on the standard spellings.

The APIs for ICU, Perl, and Windows define their own TRUE and FALSE, so
those are left as is when using those APIs.

In code comments, we use the lower-case spelling for the C concepts and
keep the upper-case spelling for the SQL concepts.

Reviewed-by: Michael Paquier <michael.paquier@gmail.com>
2017-11-08 11:37:28 -05:00

3248 lines
82 KiB
C

/*-------------------------------------------------------------------------
*
* fd.c
* Virtual file descriptor code.
*
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/storage/file/fd.c
*
* NOTES:
*
* This code manages a cache of 'virtual' file descriptors (VFDs).
* The server opens many file descriptors for a variety of reasons,
* including base tables, scratch files (e.g., sort and hash spool
* files), and random calls to C library routines like system(3); it
* is quite easy to exceed system limits on the number of open files a
* single process can have. (This is around 256 on many modern
* operating systems, but can be as low as 32 on others.)
*
* VFDs are managed as an LRU pool, with actual OS file descriptors
* being opened and closed as needed. Obviously, if a file is
* opened using these interfaces, all subsequent operations on it must
* also go through these interfaces (the File type is not a real file
* descriptor).
*
* For this scheme to work, most (if not all) routines throughout the
* server should use these interfaces instead of calling the C library
* routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
* may find ourselves short of real file descriptors anyway.
*
* INTERFACE ROUTINES
*
* PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
* A File opened with OpenTemporaryFile is automatically deleted when the
* File is closed, either explicitly or implicitly at end of transaction or
* process exit. PathNameOpenFile is intended for files that are held open
* for a long time, like relation files. It is the caller's responsibility
* to close them; there is no automatic mechanism in fd.c for that.
*
* AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
* wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
* They behave like the corresponding native functions, except that the handle
* is registered with the current subtransaction, and will be automatically
* closed at abort. These are intended mainly for short operations like
* reading a configuration file; there is a limit on the number of files that
* can be opened using these functions at any one time.
*
* Finally, BasicOpenFile is just a thin wrapper around open() that can
* release file descriptors in use by the virtual file descriptors if
* necessary. There is no automatic cleanup of file descriptors returned by
* BasicOpenFile; it is solely the caller's responsibility to close the file
* descriptor by calling close(2).
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
#ifndef WIN32
#include <sys/mman.h>
#endif
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#ifdef HAVE_SYS_RESOURCE_H
#include <sys/resource.h> /* for getrlimit */
#endif
#include "miscadmin.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/pg_tablespace.h"
#include "pgstat.h"
#include "portability/mem.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/resowner_private.h"
/*
 * Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data.
 *
 * The three conditions tested here are the same ones that guard the three
 * alternative implementations inside pg_flush_data(): sync_file_range(),
 * msync(MS_ASYNC) on non-Windows platforms, and
 * posix_fadvise(POSIX_FADV_DONTNEED).  If none holds, the macro stays
 * undefined.
 */
#if defined(HAVE_SYNC_FILE_RANGE)
#define PG_FLUSH_DATA_WORKS 1
#elif !defined(WIN32) && defined(MS_ASYNC)
#define PG_FLUSH_DATA_WORKS 1
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
#define PG_FLUSH_DATA_WORKS 1
#endif
/*
 * We must leave some file descriptors free for system(), the dynamic loader,
 * and other code that tries to open files without consulting fd.c.  This
 * is the number left free.  (While we can be pretty sure we won't get
 * EMFILE, there's never any guarantee that we won't get ENFILE due to
 * other processes chewing up FDs.  So it's a bad idea to try to open files
 * without consulting fd.c.  Nonetheless we cannot control all code.)
 *
 * Because this is just a fixed setting, we are effectively assuming that
 * no such code will leave FDs open over the long term; otherwise the slop
 * is likely to be insufficient.  Note in particular that we expect that
 * loading a shared library does not result in any permanent increase in
 * the number of open files.  (This appears to be true on most if not
 * all platforms as of Feb 2004.)
 *
 * set_max_safe_fds() subtracts this value from the probed FD budget.
 */
#define NUM_RESERVED_FDS 10
/*
 * If we have fewer than this many usable FDs after allowing for the reserved
 * ones, choke: set_max_safe_fds() reports FATAL when max_safe_fds ends up
 * below this value.
 */
#define FD_MINFREE 10
/*
 * Default mode for created files (owner read and write only), unless
 * something else is specified using the *Perm() function variants.
 * BasicOpenFile() passes this to BasicOpenFilePerm().
 */
#define PG_FILE_MODE_DEFAULT (S_IRUSR | S_IWUSR)
/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 *
 * set_max_safe_fds() uses it both as the probe limit handed to
 * count_usable_fds() and as an upper bound on the resulting budget.
 */
int max_files_per_process = 1000;
/*
 * Maximum number of file descriptors to open for either VFD entries or
 * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
 * to a conservative value, and remains that way indefinitely in bootstrap or
 * standalone-backend cases.  In normal postmaster operation, the postmaster
 * calls set_max_safe_fds() late in initialization to update the value, and
 * that value is then inherited by forked subprocesses.
 *
 * Note: the value of max_files_per_process is taken into account while
 * setting this variable, and so need not be tested separately.
 *
 * ReleaseLruFiles() compares nfile + numAllocatedDescs against this limit.
 */
int max_safe_fds = 32; /* default if not changed */
/*
 * Debugging support.
 *
 * DO_DB(A) executes statement A only when FDDEBUG is defined.  In that case
 * errno is saved before A and restored afterwards, so that debug output
 * cannot disturb an errno value the surrounding code still needs.  Without
 * FDDEBUG the macro expands to a no-op expression and A is not evaluated.
 */
#ifdef FDDEBUG
#define DO_DB(A) \
do { \
int _do_db_save_errno = errno; \
A; \
errno = _do_db_save_errno; \
} while (0)
#else
#define DO_DB(A) \
((void) 0)
#endif
/* Value stored in Vfd.fd while the VFD has no kernel file descriptor. */
#define VFD_CLOSED (-1)
/*
 * FileIsValid: true if "file" is an in-range VfdCache index (index 0, the
 * list header, never qualifies) whose slot currently has a file name, i.e.
 * is in use.  Note that the argument is evaluated more than once.
 */
#define FileIsValid(file) \
((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
/* FileIsNotOpen: true if the VFD currently holds no kernel FD. */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
/*
 * Note: a VFD's seekPos is normally always valid, but if for some reason
 * an lseek() fails, it might become set to FileUnknownPos.  We can struggle
 * along without knowing the seek position in many cases, but in some places
 * we have to fail if we don't have it.
 *
 * FilePosIsUnknown() treats any negative position as unknown, which covers
 * FileUnknownPos as well as a failed lseek() result.
 */
#define FileUnknownPos ((off_t) -1)
#define FilePosIsUnknown(pos) ((pos) < 0)
/* these are the assigned bits in fdstate below: */
#define FD_TEMPORARY (1 << 0) /* T = delete when closed */
#define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
/*
 * One entry of the VfdCache array: the bookkeeping needed to keep a file
 * "virtually" open and to re-open the underlying kernel FD on demand.
 * An entry is linked either into the free list (via nextFree) or, while it
 * holds a kernel FD, into the LRU ring (via lruMoreRecently/lruLessRecently).
 */
typedef struct vfd
{
int fd; /* current FD, or VFD_CLOSED if none */
unsigned short fdstate; /* bitflags for VFD's state (FD_TEMPORARY etc.) */
ResourceOwner resowner; /* owner, for automatic cleanup */
File nextFree; /* link to next free VFD, if in freelist */
File lruMoreRecently; /* doubly linked recency-of-use list */
File lruLessRecently;
off_t seekPos; /* current logical file position, or -1 (FileUnknownPos) */
off_t fileSize; /* current size of file (0 if not temporary) */
char *fileName; /* name of file, or NULL for unused VFD */
/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
int fileFlags; /* open(2) flags for (re)opening the file */
mode_t fileMode; /* mode to pass to open(2) */
} Vfd;
/*
 * Virtual File Descriptor array pointer and size.  This grows as
 * needed.  'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header.
 *
 * The array is malloc'd by InitFileAccess() and enlarged with realloc() by
 * AllocateVfd(); SizeVfdCache stays 0 until InitFileAccess() has run.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;
/*
 * Number of file descriptors known to be in use by VFD entries.
 * Incremented when LruInsert() opens a kernel FD and decremented when
 * LruDelete() closes one.
 */
static int nfile = 0;
/*
 * Flag to tell whether it's worth scanning VfdCache looking for temp files
 * to close
 */
static bool have_xact_temporary_files = false;
/*
 * Tracks the total size of all temporary files.  Note: when temp_file_limit
 * is being enforced, this cannot overflow since the limit cannot be more
 * than INT_MAX kilobytes.  When not enforcing, it could theoretically
 * overflow, but we don't care.
 */
static uint64 temporary_files_size = 0;
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 *
 * AllocateDescKind says which member of the AllocateDesc.desc union is
 * meaningful.  NOTE(review): AllocateDescPipe presumably also uses the
 * "file" member (a popen'd FILE *) -- confirm against OpenPipeStream.
 */
typedef enum
{
AllocateDescFile,
AllocateDescPipe,
AllocateDescDir,
AllocateDescRawFD
} AllocateDescKind;
typedef struct
{
AllocateDescKind kind;
SubTransactionId create_subid; /* subtransaction that created the handle */
union
{
FILE *file;
DIR *dir;
int fd;
} desc;
} AllocateDesc;
/*
 * numAllocatedDescs counts the handles currently in allocatedDescs; it is
 * added to nfile when ReleaseLruFiles() checks the max_safe_fds limit.
 */
static int numAllocatedDescs = 0;
static int maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;
/*
 * Number of temporary files opened during the current session;
 * this is used in generation of tempfile names.
 */
static long tempFileCounter = 0;
/*
 * Array of OIDs of temp tablespaces.  When numTempTableSpaces is -1,
 * this has not been set in the current transaction.
 *
 * NOTE(review): nextTempTableSpace is presumably the index into
 * tempTableSpaces of the next tablespace to hand out -- confirm against the
 * code that reads it (not visible in this part of the file).
 */
static Oid *tempTableSpaces = NULL;
static int numTempTableSpaces = -1;
static int nextTempTableSpace = 0;
/*--------------------
 *
 * Private Routines
 *
 * Delete		   - delete a file from the Lru ring
 * LruDelete	   - remove a file from the Lru ring and close its FD
 * Insert		   - put a file at the front of the Lru ring
 * LruInsert	   - put a file at the front of the Lru ring and open it
 * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
 * AllocateVfd	   - grab a free (or new) file record (from VfdCache)
 * FreeVfd		   - free a file record
 *
 * The Least Recently Used ring is a doubly linked list that begins and
 * ends on element zero.  Element zero is special -- it doesn't represent
 * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
 * anchor that shows us the beginning/end of the ring.
 * Only VFD elements that are currently really open (have an FD assigned) are
 * in the Lru ring.  Elements that are "virtually" open can be recognized
 * by having a non-null fileName field.
 *
 * example:
 *
 *     /--less----\                /---------\
 *     v           \              v           \
 *   #0 --more---> LeastRecentlyUsed --more-\ \
 *    ^\                                    | |
 *     \\less--> MostRecentlyUsedFile   <---/ |
 *      \more---/                    \--less--/
 *
 * That is: VfdCache[0].lruMoreRecently is the least recently used entry
 * and VfdCache[0].lruLessRecently is the most recently used one.
 *
 *--------------------
 */
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int LruInsert(File file);
static bool ReleaseLruFile(void);
static void ReleaseLruFiles(void);
static File AllocateVfd(void);
static void FreeVfd(File file);
static int FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static bool reserveAllocatedDesc(void);
static int FreeDesc(AllocateDesc *desc);
static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname);
static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
static bool looks_like_temp_rel_name(const char *name);
static void walkdir(const char *path,
void (*action) (const char *fname, bool isdir, int elevel),
bool process_symlinks,
int elevel);
/* pre_sync_fname is only declared when pg_flush_data has an implementation */
#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
static int fsync_parent_path(const char *fname, int elevel);
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when the configured sync_method asks
 * for it, and to pg_fsync_no_writethrough() otherwise.  Returns whatever the
 * chosen routine returns.
 */
int
pg_fsync(int fd)
{
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	/* The sync_method test is only compiled in where it can matter. */
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
/*
 * pg_fsync_no_writethrough --- same as fsync except does nothing if
 * enableFsync is off
 *
 * Returns fsync()'s result, or 0 without touching the file when fsync is
 * disabled.
 */
int
pg_fsync_no_writethrough(int fd)
{
	if (!enableFsync)
		return 0;

	return fsync(fd);
}
/*
 * pg_fsync_writethrough
 *
 * Flush the file with the platform's write-through primitive: _commit() on
 * Windows, fcntl(F_FULLFSYNC) where that is defined.  On platforms with
 * neither, fail with errno = ENOSYS.  Does nothing and returns 0 when
 * enableFsync is off.  Returns 0 on success, -1 on failure.
 */
int
pg_fsync_writethrough(int fd)
{
	if (!enableFsync)
		return 0;

#ifdef WIN32
	return _commit(fd);
#elif defined(F_FULLFSYNC)
	if (fcntl(fd, F_FULLFSYNC, 0) == -1)
		return -1;
	return 0;
#else
	errno = ENOSYS;
	return -1;
#endif
}
/*
 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
 *
 * Not all platforms have fdatasync; where HAVE_FDATASYNC is not defined a
 * plain fsync() is issued instead.
 */
int
pg_fdatasync(int fd)
{
	if (!enableFsync)
		return 0;

#ifdef HAVE_FDATASYNC
	return fdatasync(fd);
#else
	return fsync(fd);
#endif
}
/*
 * pg_flush_data --- advise OS that the described dirty data should be flushed
 *
 * offset of 0 with nbytes 0 means that the entire file should be flushed;
 * in this case, this function may have side-effects on the file's
 * seek position!  (The msync() implementation determines the file length
 * with lseek(fd, 0, SEEK_END).)
 *
 * Up to three implementations are compiled in, tried in order of
 * preference: sync_file_range(), mmap()+msync(MS_ASYNC), and
 * posix_fadvise(POSIX_FADV_DONTNEED).  The first and last always return
 * after their attempt; the msync() one falls through to the next
 * implementation (if any) when the mapping cannot be established.
 *
 * Flush failures are reported as WARNING only, since this is merely a
 * performance hint.  The one exception is a failing munmap(), which is
 * reported as FATAL.
 */
void
pg_flush_data(int fd, off_t offset, off_t nbytes)
{
/*
 * Right now file flushing is primarily used to make later
 * fsync()/fdatasync() calls have less impact.  Thus don't trigger flushes
 * if fsyncs are disabled - that's a decision we might want to make
 * configurable at some point.
 */
if (!enableFsync)
return;
/*
 * We compile all alternatives that are supported on the current platform,
 * to find portability problems more easily.
 */
#if defined(HAVE_SYNC_FILE_RANGE)
{
int rc;
/*
 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
 * tells the OS that writeback for the specified blocks should be
 * started, but that we don't want to wait for completion.  Note that
 * this call might block if too much dirty data exists in the range.
 * This is the preferable method on OSs supporting it, as it works
 * reliably when available (contrast to msync()) and doesn't flush out
 * clean data (like FADV_DONTNEED).
 */
rc = sync_file_range(fd, offset, nbytes,
SYNC_FILE_RANGE_WRITE);
/* don't error out, this is just a performance optimization */
if (rc != 0)
{
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not flush dirty data: %m")));
}
/* this implementation never falls through to the ones below */
return;
}
#endif
#if !defined(WIN32) && defined(MS_ASYNC)
{
void *p;
/* cached across calls; 0 means "not fetched yet" */
static int pagesize = 0;
/*
 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
 * writeback.  On linux it only does so if MS_SYNC is specified, but
 * then it does the writeback synchronously.  Luckily all common linux
 * systems have sync_file_range().  This is preferable over
 * FADV_DONTNEED because it doesn't flush out clean data.
 *
 * We map the file (mmap()), tell the kernel to sync back the contents
 * (msync()), and then remove the mapping again (munmap()).
 */
/* mmap() needs actual length if we want to map whole file */
if (offset == 0 && nbytes == 0)
{
/* NB: this moves the file's seek position to end-of-file */
nbytes = lseek(fd, 0, SEEK_END);
if (nbytes < 0)
{
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not determine dirty data size: %m")));
return;
}
}
/*
 * Some platforms reject partial-page mmap() attempts.  To deal with
 * that, just truncate the request to a page boundary.  If any extra
 * bytes don't get flushed, well, it's only a hint anyway.
 */
/* fetch pagesize only once */
if (pagesize == 0)
pagesize = sysconf(_SC_PAGESIZE);
/* align length to pagesize, dropping any fractional page */
if (pagesize > 0)
nbytes = (nbytes / pagesize) * pagesize;
/* fractional-page request is a no-op */
if (nbytes <= 0)
return;
/*
 * mmap could well fail, particularly on 32-bit platforms where there
 * may simply not be enough address space.  If so, silently fall
 * through to the next implementation.  Requests longer than SSIZE_MAX
 * are not even attempted and are treated like a failed mmap().
 */
if (nbytes <= (off_t) SSIZE_MAX)
p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
else
p = MAP_FAILED;
if (p != MAP_FAILED)
{
int rc;
rc = msync(p, (size_t) nbytes, MS_ASYNC);
if (rc != 0)
{
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not flush dirty data: %m")));
/* NB: need to fall through to munmap()! */
}
rc = munmap(p, (size_t) nbytes);
if (rc != 0)
{
/* FATAL error because mapping would remain */
ereport(FATAL,
(errcode_for_file_access(),
errmsg("could not munmap() while flushing data: %m")));
}
return;
}
/* mapping failed: continue with the next implementation, if any */
}
#endif
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
{
int rc;
/*
 * Signal the kernel that the passed in range should not be cached
 * anymore.  This has the desired side effect of writing out dirty
 * data, and the undesired side effect of likely discarding useful
 * clean cached blocks.  For the latter reason this is the least
 * preferable method.
 */
rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
if (rc != 0)
{
/* don't error out, this is just a performance optimization */
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not flush dirty data: %m")));
}
return;
}
#endif
}
/*
* fsync_fname -- fsync a file or directory, handling errors properly
*
* Try to fsync a file or directory. When doing the latter, ignore errors that
* indicate the OS just doesn't allow/require fsyncing directories.
*/
void
fsync_fname(const char *fname, bool isdir)
{
fsync_fname_ext(fname, isdir, false, ERROR);
}
/*
 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
 *
 * This routine ensures that, after returning, the effect of renaming file
 * persists in case of a crash.  A crash while this routine is running will
 * leave you with either the pre-existing or the moved file in place of the
 * new file; no mixed state or truncated files are possible.
 *
 * It does so by using fsync on the old filename and the possibly existing
 * target filename before the rename, and the target file and directory after.
 *
 * Note that rename() cannot be used across arbitrary directories, as they
 * might not be on the same filesystem.  Therefore this routine does not
 * support renaming across directories.
 *
 * oldfile: existing path to rename.
 * newfile: destination path; may or may not exist beforehand.
 * elevel:  severity with which every failure is logged.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_rename(const char *oldfile, const char *newfile, int elevel)
{
	int fd;

	/*
	 * First fsync the old and target path (if it exists), to ensure that they
	 * are properly persistent on disk.  Syncing the target file is not
	 * strictly necessary, but it makes it easier to reason about crashes;
	 * because it's then guaranteed that either source or target file exists
	 * after a crash.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

	fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
	if (fd < 0)
	{
		/* A missing target is fine; anything else is a real failure. */
		if (errno != ENOENT)
		{
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", newfile)));
			return -1;
		}
	}
	else
	{
		if (pg_fsync(fd) != 0)
		{
			int save_errno;

			/* close file upon error, might not be in transaction context */
			save_errno = errno;
			CloseTransientFile(fd);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m", newfile)));
			return -1;
		}

		/*
		 * A failing close can be the only place a write-back error is
		 * reported, so do not ignore it: treat it like any other failure
		 * of this routine.
		 */
		if (CloseTransientFile(fd) != 0)
		{
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not close file \"%s\": %m", newfile)));
			return -1;
		}
	}

	/* Time to do the real deal... */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/*
	 * To guarantee renaming the file is persistent, fsync the file with its
	 * new name, and its containing directory.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * Removes fname and then fsyncs its parent directory, so that the removal
 * survives a crash once this routine has returned.  A crash while it is
 * running leaves the system in no mixed state.
 *
 * Failures are logged with the severity given in elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int removed = unlink(fname);

	if (removed < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* The directory entry is gone; make that fact persistent too. */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Like durable_rename(), but tries (without guaranteeing it) not to
 * overwrite an existing target: where HAVE_WORKING_LINK is set the new name
 * is created with link() and the old name is then unlinked; elsewhere a
 * plain rename() is used.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Failures are logged with the caller-specified severity.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Sync the source first so that, should we crash right after the
	 * rename/link, the file that ends up in place has valid contents.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* the result of removing the old name is not checked */
	unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * To survive an OS crash, both the new entry and then its parent
	 * directory have to be flushed, in that order.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
/*
* InitFileAccess --- initialize this module during backend startup
*
* This is called during either normal or standalone backend start.
* It is *not* called in the postmaster.
*/
void
InitFileAccess(void)
{
Assert(SizeVfdCache == 0); /* call me only once */
/* initialize cache header entry */
VfdCache = (Vfd *) malloc(sizeof(Vfd));
if (VfdCache == NULL)
ereport(FATAL,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
VfdCache->fd = VFD_CLOSED;
SizeVfdCache = 1;
/* register proc-exit hook to ensure temp files are dropped at exit */
on_proc_exit(AtProcExit_Files, 0);
}
/*
 * count_usable_fds --- count how many FDs the system will let us open,
 *		and estimate how many are already open.
 *
 * max_to_probe: stop probing once this many descriptors have been obtained.
 * usable_fds:   output, number of descriptors we managed to obtain.
 * already_open: output, estimated number of descriptors that were already
 *               in use before probing.
 *
 * We stop counting if usable_fds reaches max_to_probe.  Note: a small
 * value of max_to_probe might result in an underestimate of already_open;
 * we must fill in any "gaps" in the set of used FDs before the calculation
 * of already_open will give the right answer.  In practice, max_to_probe
 * of a couple of dozen should be enough to ensure good results.
 *
 * We assume stderr (FD 2) is available for dup'ing.  Probing with stdin
 * instead would make every dup() fail with EBADF when the program that
 * launched us closed stdin, which would then be misread as "no file
 * descriptors available".  Closing stderr is far less likely, since it
 * risks losing error messages.
 */
static void
count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
{
	int *fd;
	int size;
	int used = 0;
	int highestfd = 0;
	int j;

#ifdef HAVE_GETRLIMIT
	struct rlimit rlim;
	int getrlimit_status;
#endif

	/* array remembering every probe descriptor so we can close them again */
	size = 1024;
	fd = (int *) palloc(size * sizeof(int));

#ifdef HAVE_GETRLIMIT
#ifdef RLIMIT_NOFILE			/* most platforms use RLIMIT_NOFILE */
	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
#else							/* but BSD doesn't ... */
	getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
#endif							/* RLIMIT_NOFILE */
	if (getrlimit_status != 0)
		ereport(WARNING, (errmsg("getrlimit failed: %m")));
#endif							/* HAVE_GETRLIMIT */

	/* dup until failure or probe limit reached */
	for (;;)
	{
		int thisfd;

#ifdef HAVE_GETRLIMIT

		/*
		 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
		 * some platforms
		 */
		if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
			break;
#endif

		thisfd = dup(STDERR_FILENO);
		if (thisfd < 0)
		{
			/* Expect EMFILE or ENFILE, else it's fishy */
			if (errno != EMFILE && errno != ENFILE)
				elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
			break;
		}

		/* enlarge the bookkeeping array when it is full */
		if (used >= size)
		{
			size *= 2;
			fd = (int *) repalloc(fd, size * sizeof(int));
		}
		fd[used++] = thisfd;

		if (highestfd < thisfd)
			highestfd = thisfd;

		if (used >= max_to_probe)
			break;
	}

	/* release the files we opened */
	for (j = 0; j < used; j++)
		close(fd[j]);

	pfree(fd);

	/*
	 * Return results.  usable_fds is just the number of successful dups.  We
	 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
	 * number) and so already_open is highestfd+1 - usable_fds.
	 */
	*usable_fds = used;
	*already_open = highestfd + 1 - used;
}
/*
 * set_max_safe_fds
 *		Determine number of filedescriptors that fd.c is allowed to use
 *
 * Probes the system with count_usable_fds() and stores the resulting budget
 * in max_safe_fds.  Reports FATAL if fewer than FD_MINFREE descriptors
 * remain once NUM_RESERVED_FDS have been set aside.
 */
void
set_max_safe_fds(void)
{
	int probed_usable;
	int probed_open;
	int config_room;

	/*----------
	 * The target value is
	 *			MIN(usable_fds, max_files_per_process - already_open)
	 * less the slop factor for files that are opened without consulting
	 * fd.c.  That keeps us within both max_files_per_process and the
	 * experimentally-determined EMFILE limit.
	 *----------
	 */
	count_usable_fds(max_files_per_process,
					 &probed_usable, &probed_open);

	config_room = max_files_per_process - probed_open;
	if (probed_usable < config_room)
		max_safe_fds = probed_usable;
	else
		max_safe_fds = config_room;

	/* Set aside the FDs reserved for system() etc. */
	max_safe_fds -= NUM_RESERVED_FDS;

	/* Refuse to run if what is left is not enough to get by. */
	if (max_safe_fds < FD_MINFREE)
		ereport(FATAL,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("insufficient file descriptors available to start server process"),
				 errdetail("System allows %d, we need at least %d.",
						   max_safe_fds + NUM_RESERVED_FDS,
						   FD_MINFREE + NUM_RESERVED_FDS)));

	elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
		 max_safe_fds, probed_usable, probed_open);
}
/*
 * BasicOpenFile --- BasicOpenFilePerm() with the default file mode
 *
 * Supplies PG_FILE_MODE_DEFAULT as the fileMode argument and returns
 * whatever BasicOpenFilePerm() returns.
 */
int
BasicOpenFile(const char *fileName, int fileFlags)
{
	const mode_t defaultMode = PG_FILE_MODE_DEFAULT;

	return BasicOpenFilePerm(fileName, fileFlags, defaultMode);
}
/*
 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
 *
 * This is exported for use by places that really want a plain kernel FD,
 * but need to be proof against running out of FDs.  Once an FD has been
 * successfully returned, it is the caller's responsibility to ensure that
 * it will not be leaked on ereport()!	Most users should *not* call this
 * routine directly, but instead use the VFD abstraction level, which
 * provides protection against descriptor leaks as well as management of
 * files that need to be open for more than a short period of time.
 *
 * Ideally this should be the *only* direct call of open() in the backend.
 * In practice, the postmaster calls open() directly, and there are some
 * direct open() calls done early in backend startup.  Those are OK since
 * this module wouldn't have any open files to close at that point anyway.
 *
 * Returns the new descriptor, or -1 with errno set.  When open() fails with
 * EMFILE or ENFILE, one LRU VFD is closed and the open is retried; this
 * repeats until the open succeeds, fails for another reason, or no VFD is
 * left to close (in which case errno is the EMFILE/ENFILE of the last
 * attempt).
 */
int
BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
{
	for (;;)
	{
		int open_errno;
		int fd = open(fileName, fileFlags, fileMode);

		if (fd >= 0)
			return fd;			/* success! */

		/* Only descriptor exhaustion is worth a retry. */
		if (errno != EMFILE && errno != ENFILE)
			return -1;

		open_errno = errno;

		ereport(LOG,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("out of file descriptors: %m; release and retry")));
		errno = 0;

		if (!ReleaseLruFile())
		{
			/* nothing left to close: report the original failure */
			errno = open_errno;
			return -1;
		}
	}
}
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring as one line
 *
 * Walks the ring from the most recently used entry towards the least
 * recently used one and emits the visited indexes (ending with the header's
 * index 0) in a single LOG message.  Output that does not fit into the
 * 2048-byte buffer is silently truncated.
 */
static void
_dump_lru(void)
{
	int cur = VfdCache[0].lruLessRecently;
	char text[2048];
	size_t len;

	snprintf(text, sizeof(text), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		len = strlen(text);
		snprintf(text + len, sizeof(text) - len, "%d ", cur);
	}

	len = strlen(text);
	snprintf(text + len, sizeof(text) - len, "LEAST");

	elog(LOG, "%s", text);
}
#endif							/* FDDEBUG */
/*
 * Delete --- unlink a VFD from the LRU ring
 *
 * Only the neighbours' links are updated; the kernel FD is not touched and
 * the entry's own link fields are left as they were.  "file" must not be 0
 * (the ring header).
 */
static void
Delete(File file)
{
	Vfd *victim;
	File older;
	File newer;

	Assert(file != 0);

	DO_DB(elog(LOG, "Delete %d (%s)",
			   file, VfdCache[file].fileName));
	DO_DB(_dump_lru());

	victim = &VfdCache[file];
	older = victim->lruLessRecently;
	newer = victim->lruMoreRecently;

	/* make the two neighbours point at each other */
	VfdCache[older].lruMoreRecently = newer;
	VfdCache[newer].lruLessRecently = older;

	DO_DB(_dump_lru());
}
/*
 * LruDelete --- close a VFD's kernel FD and take it out of the LRU ring
 *
 * The VFD stays "virtually" open: the slot is marked VFD_CLOSED and nfile is
 * decremented.  Problems while seeking or closing are logged, never raised
 * as errors.  "file" must not be 0.
 */
static void
LruDelete(File file)
{
	Vfd *entry;

	Assert(file != 0);

	DO_DB(elog(LOG, "LruDelete %d (%s)",
			   file, VfdCache[file].fileName));

	entry = &VfdCache[file];

	/*
	 * The seek position is normally known.  If we lost track of it, make one
	 * more attempt to fetch it now.  Should that fail as well, the position
	 * cannot be restored when and if the file is re-opened -- but throwing
	 * an error and refusing to close the file is not an option, since that
	 * would break activities such as transaction cleanup.
	 */
	if (FilePosIsUnknown(entry->seekPos))
	{
		entry->seekPos = lseek(entry->fd, (off_t) 0, SEEK_CUR);
		if (FilePosIsUnknown(entry->seekPos))
			elog(LOG, "could not seek file \"%s\" before closing: %m",
				 entry->fileName);
	}

	/*
	 * Close the file.  Failure is not expected; if it happens anyway, it is
	 * better to leak the FD than to mess up our internal state.
	 */
	if (close(entry->fd) != 0)
		elog(LOG, "could not close file \"%s\": %m", entry->fileName);

	entry->fd = VFD_CLOSED;
	nfile--;

	/* finally drop the record from the LRU ring */
	Delete(file);
}
/*
 * Insert --- link a VFD into the LRU ring as the most recently used entry
 *
 * Only the ring links are changed; nothing is opened.  "file" must not be 0.
 */
static void
Insert(File file)
{
	Vfd *header = &VfdCache[0];
	Vfd *entry;

	Assert(file != 0);

	DO_DB(elog(LOG, "Insert %d (%s)",
			   file, VfdCache[file].fileName));
	DO_DB(_dump_lru());

	entry = &VfdCache[file];

	/* splice in between the header and the previous most-recent entry */
	entry->lruMoreRecently = 0;
	entry->lruLessRecently = header->lruLessRecently;
	header->lruLessRecently = file;
	VfdCache[entry->lruLessRecently].lruMoreRecently = file;

	DO_DB(_dump_lru());
}
/*
 * LruInsert --- make sure a VFD has a kernel FD and put it at the head of
 * the LRU ring
 *
 * If the VFD is currently closed, it is re-opened with its remembered name,
 * flags and mode, and its remembered seek position is restored.
 *
 * Returns 0 on success, -1 on re-open failure (with errno set).  On failure
 * the VFD is left closed and is not linked into the ring.
 */
static int
LruInsert(File file)
{
	Vfd *entry;

	Assert(file != 0);

	DO_DB(elog(LOG, "LruInsert %d (%s)",
			   file, VfdCache[file].fileName));

	entry = &VfdCache[file];

	if (FileIsNotOpen(file))
	{
		/* Close excess kernel FDs. */
		ReleaseLruFiles();

		/*
		 * Even so, the open may fail for lack of descriptors (e.g. because
		 * the system-wide file table is full); BasicOpenFilePerm() releases
		 * further FDs in that case.
		 */
		entry->fd = BasicOpenFilePerm(entry->fileName, entry->fileFlags,
									  entry->fileMode);
		if (entry->fd < 0)
		{
			DO_DB(elog(LOG, "re-open failed: %m"));
			return -1;
		}
		++nfile;

		/*
		 * Restore the seek position.  No special case is needed for seekPos
		 * equal to FileUnknownPos: lseek() will certainly reject that, which
		 * completes the logic noted in LruDelete() -- a file whose position
		 * could not be obtained before closing fails to re-open.
		 */
		if (entry->seekPos != (off_t) 0 &&
			lseek(entry->fd, entry->seekPos, SEEK_SET) < 0)
		{
			/* A failed seek is handled just like a failed open(). */
			int seek_errno = errno;

			elog(LOG, "could not seek file \"%s\" after re-opening: %m",
				 entry->fileName);
			(void) close(entry->fd);
			entry->fd = VFD_CLOSED;
			--nfile;
			errno = seek_errno;
			return -1;
		}
	}

	/* put it at the head of the Lru ring */
	Insert(file);

	return 0;
}
/*
 * ReleaseLruFile --- free one kernel FD by closing the least-recently-used
 * VFD
 *
 * Returns true if a file was closed, false if no VFD currently holds a
 * kernel FD.
 */
static bool
ReleaseLruFile(void)
{
	DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));

	if (nfile <= 0)
		return false;			/* no files available to free */

	/*
	 * With open files around, the ring must contain at least one used vfd.
	 */
	Assert(VfdCache[0].lruMoreRecently != 0);
	LruDelete(VfdCache[0].lruMoreRecently);

	return true;				/* freed a file */
}
/*
 * ReleaseLruFiles --- close LRU VFDs until we are under the max_safe_fds
 * limit
 *
 * Both VFD-held descriptors (nfile) and AllocateFile-style descriptors
 * (numAllocatedDescs) count against the limit.  After calling this, it's OK
 * to try to open another file.  Gives up quietly once nothing closable
 * remains.
 */
static void
ReleaseLruFiles(void)
{
	for (;;)
	{
		if (nfile + numAllocatedDescs < max_safe_fds)
			return;				/* enough headroom */

		if (!ReleaseLruFile())
			return;				/* nothing more to close */
	}
}
static File
AllocateVfd(void)
{
Index i;
File file;
DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
if (VfdCache[0].nextFree == 0)
{
/*
* The free list is empty so it is time to increase the size of the
* array. We choose to double it each time this happens. However,
* there's not much point in starting *real* small.
*/
Size newCacheSize = SizeVfdCache * 2;
Vfd *newVfdCache;
if (newCacheSize < 32)
newCacheSize = 32;
/*
* Be careful not to clobber VfdCache ptr if realloc fails.
*/
newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
if (newVfdCache == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
VfdCache = newVfdCache;
/*
* Initialize the new entries and link them into the free list.
*/
for (i = SizeVfdCache; i < newCacheSize; i++)
{
MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
VfdCache[i].nextFree = i + 1;
VfdCache[i].fd = VFD_CLOSED;
}
VfdCache[newCacheSize - 1].nextFree = 0;
VfdCache[0].nextFree = SizeVfdCache;
/*
* Record the new size
*/
SizeVfdCache = newCacheSize;
}
file = VfdCache[0].nextFree;
VfdCache[0].nextFree = VfdCache[file].nextFree;
return file;
}
/*
 * FreeVfd
 *
 * Return a VFD slot to the free list: release its malloc'd file name (if
 * any), clear its state flags, and push it onto the front of the list
 * headed by VfdCache[0].
 */
static void
FreeVfd(File file)
{
	Vfd		   *slot = &VfdCache[file];
	Vfd		   *listHead = &VfdCache[0];

	DO_DB(elog(LOG, "FreeVfd: %d (%s)",
			   file, slot->fileName ? slot->fileName : ""));

	if (slot->fileName != NULL)
	{
		free(slot->fileName);
		slot->fileName = NULL;
	}
	slot->fdstate = 0x0;

	/* Push the slot onto the head of the free list. */
	slot->nextFree = listHead->nextFree;
	listHead->nextFree = file;
}
/*
 * FileAccess
 *
 * Prepare a VFD for use: make sure it has an open kernel descriptor and is
 * at the head of the LRU ring.
 *
 * Returns 0 on success, -1 on re-open failure (with errno set).
 */
static int
FileAccess(File file)
{
	DO_DB(elog(LOG, "FileAccess %d (%s)",
			   file, VfdCache[file].fileName));

	/*
	 * A closed file has to be re-opened; LruInsert also places it at the
	 * head of the ring (possibly closing the least recently used file to
	 * obtain an FD), so its result is ours.
	 */
	if (FileIsNotOpen(file))
		return LruInsert(file);

	/*
	 * The file is open.  Unless it is already the most recently accessed
	 * one, move it to the head of the ring.
	 */
	if (VfdCache[0].lruLessRecently != file)
	{
		Delete(file);
		Insert(file);
	}

	return 0;
}
/*
 * FileInvalidate
 *
 * Called when we get a shared invalidation message on some relation.
 * Physically closes the file's kernel descriptor if it has one.  Currently
 * compiled out (NOT_USED).
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	/* Nothing to do for a VFD that holds no kernel descriptor. */
	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
/*
 * PathNameOpenFile
 *
 * Convenience wrapper around PathNameOpenFilePerm() that supplies the
 * default file mode (PG_FILE_MODE_DEFAULT) as the fileMode argument.
 */
File
PathNameOpenFile(const char *fileName, int fileFlags)
{
	const mode_t defaultMode = PG_FILE_MODE_DEFAULT;

	return PathNameOpenFilePerm(fileName, fileFlags, defaultMode);
}
/*
 * PathNameOpenFilePerm
 *
 * Open a file in an arbitrary directory and return a VFD for it.
 *
 * fileName:  path to open.  NB: if the passed pathname is relative (which
 *			  it usually is), it will be interpreted relative to the process'
 *			  working directory (which should always be $PGDATA when this
 *			  code is running).
 * fileFlags: open(2) flags.  The flags remembered for later re-opens have
 *			  O_CREAT, O_TRUNC and O_EXCL stripped.
 * fileMode:  permission bits handed to the open call.
 *
 * Returns the new File, or -1 with errno set if the file could not be
 * opened.  Running out of memory is reported with ereport(ERROR).
 */
File
PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
{
	char	   *fnamecopy;
	File		file;
	Vfd		   *vfdP;

	DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
			   fileName, fileFlags, fileMode));

	/*
	 * Obtain the VFD slot before making the malloc'd copy of the name.
	 * AllocateVfd() reports out-of-memory via ereport(ERROR), which does not
	 * return here, so at that point nothing must be held that only this
	 * function could free.
	 */
	file = AllocateVfd();

	/*
	 * We need a malloc'd copy of the file name; fail cleanly if no room,
	 * giving the unused slot back first.
	 */
	fnamecopy = strdup(fileName);
	if (fnamecopy == NULL)
	{
		FreeVfd(file);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	}

	/* Take the address only now: AllocateVfd() may have moved the array. */
	vfdP = &VfdCache[file];

	/* Close excess kernel FDs. */
	ReleaseLruFiles();

	vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);

	if (vfdP->fd < 0)
	{
		/* Undo everything, preserving the errno of the failed open. */
		int			save_errno = errno;

		FreeVfd(file);
		free(fnamecopy);
		errno = save_errno;
		return -1;
	}
	++nfile;
	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
			   vfdP->fd));

	Insert(file);

	vfdP->fileName = fnamecopy;
	/* Saved flags are adjusted to be OK for re-opening file */
	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
	vfdP->fileMode = fileMode;
	vfdP->seekPos = 0;
	vfdP->fileSize = 0;
	vfdP->fdstate = 0x0;
	vfdP->resowner = NULL;

	return file;
}
/*
 * OpenTemporaryFile
 *
 * Open a temporary file that will disappear when we close it.
 *
 * The tempfile name is generated here, and no fileFlags or fileMode are
 * taken because only one setting makes sense for a temp file.
 *
 * When interXact is false the file is registered with CurrentResourceOwner
 * so that it gets closed and deleted once it is no longer needed, typically
 * at end of transaction.  That is what callers want in most cases; pass
 * true only for "somewhat" temporary storage that must outlive the creating
 * transaction.  Either way the file is removed when the File is explicitly
 * closed.
 */
File
OpenTemporaryFile(bool interXact)
{
	File		file = 0;

	/*
	 * Prefer the next configured temp tablespace, if there are any.  A
	 * tablespace that can't be found is silently skipped in favor of the
	 * database's default tablespace.
	 *
	 * BUT: a file meant to outlive the current transaction always goes into
	 * the database's default tablespace, so that it cannot get in the way
	 * of tablespace drop attempts.
	 */
	if (!interXact && numTempTableSpaces > 0)
	{
		Oid			tempSpace = GetNextTempTableSpace();

		if (OidIsValid(tempSpace))
			file = OpenTemporaryFileInTablespace(tempSpace, false);
	}

	/*
	 * Otherwise, or if that tablespace was unusable, create the file in the
	 * database's default tablespace.  MyDatabaseTableSpace should normally
	 * be set by now; if it isn't, fall back to pg_default.
	 */
	if (file <= 0)
	{
		Oid			fallbackSpace = MyDatabaseTableSpace;

		if (!fallbackSpace)
			fallbackSpace = DEFAULTTABLESPACE_OID;

		file = OpenTemporaryFileInTablespace(fallbackSpace, true);
	}

	/* Mark it for deletion at close */
	VfdCache[file].fdstate |= FD_TEMPORARY;

	/* A cross-transaction file is not tied to any resource owner. */
	if (interXact)
		return file;

	/* Register it with the current resource owner */
	VfdCache[file].fdstate |= FD_XACT_TEMPORARY;

	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
	ResourceOwnerRememberFile(CurrentResourceOwner, file);
	VfdCache[file].resowner = CurrentResourceOwner;

	/* ensure cleanup happens at eoxact */
	have_xact_temporary_files = true;

	return file;
}
/*
 * OpenTemporaryFileInTablespace
 *
 * Open a temporary file in a specific tablespace.  Subroutine for
 * OpenTemporaryFile, which see for details.
 *
 * If the file cannot be created, an ERROR is raised when rejectError is
 * true; otherwise the failing result of the open is returned to the caller.
 */
static File
OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
{
	char		dirPath[MAXPGPATH];
	char		filePath[MAXPGPATH];
	File		file;

	/*
	 * Work out the tempfile directory for this tablespace.  A request for
	 * pg_global is served from pg_default instead.
	 */
	if (tblspcOid != DEFAULTTABLESPACE_OID &&
		tblspcOid != GLOBALTABLESPACE_OID)
	{
		/* All other tablespaces are accessed via symlinks */
		snprintf(dirPath, sizeof(dirPath), "pg_tblspc/%u/%s/%s",
				 tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
	}
	else
	{
		/* The default tablespace is {datadir}/base */
		snprintf(dirPath, sizeof(dirPath), "base/%s",
				 PG_TEMP_FILES_DIR);
	}

	/*
	 * Build a name that should be unique within the current database
	 * instance: our PID plus a counter that is bumped on every call.
	 */
	snprintf(filePath, sizeof(filePath), "%s/%s%d.%ld",
			 dirPath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);

	/*
	 * Open the file.  O_EXCL is deliberately not used, in case there is an
	 * orphaned temp file that can be reused.
	 */
	file = PathNameOpenFile(filePath,
							O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
	if (file > 0)
		return file;

	/*
	 * The tablespace's tempfile directory may not exist yet, so try to
	 * create it.  The mkdir result is intentionally ignored: it could fail
	 * merely because someone else just created the directory.  If it really
	 * didn't work, the second open attempt fails instead.
	 */
	mkdir(dirPath, S_IRWXU);

	file = PathNameOpenFile(filePath,
							O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
	if (file <= 0 && rejectError)
		elog(ERROR, "could not create temporary file \"%s\": %m",
			 filePath);

	return file;
}
/*
 * FileClose - close a file when done with it
 *
 * Closes the kernel descriptor if the VFD currently has one and takes the
 * VFD out of the LRU ring.  If the file is flagged FD_TEMPORARY it is also
 * unlinked, its size is subtracted from temporary_files_size, and its
 * stat() size is reported via pgstat_report_tempfile() (and logged,
 * depending on log_temp_files).  Finally the file is unregistered from its
 * resource owner, if any, and the VFD slot is returned to the free list.
 *
 * Failures of close(), unlink() and stat() are only logged at LOG level.
 */
void
FileClose(File file)
{
Vfd *vfdP;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileClose: %d (%s)",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
if (!FileIsNotOpen(file))
{
/* close the file */
if (close(vfdP->fd))
elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
/* the descriptor is counted as released even if close() complained */
--nfile;
vfdP->fd = VFD_CLOSED;
/* remove the file from the lru ring */
Delete(file);
}
/*
 * Delete the file if it was temporary, and make a log entry if wanted
 */
if (vfdP->fdstate & FD_TEMPORARY)
{
struct stat filestats;
int stat_errno;
/*
 * If we get an error, as could happen within the ereport/elog calls,
 * we'll come right back here during transaction abort.  Reset the
 * flag to ensure that we can't get into an infinite loop.  This code
 * is arranged to ensure that the worst-case consequence is failing to
 * emit log message(s), not failing to attempt the unlink.
 */
vfdP->fdstate &= ~FD_TEMPORARY;
/* Subtract its size from current usage (do first in case of error) */
temporary_files_size -= vfdP->fileSize;
vfdP->fileSize = 0;
/* first try the stat(); remember its errno for the report further down */
if (stat(vfdP->fileName, &filestats))
stat_errno = errno;
else
stat_errno = 0;
/* in any case do the unlink */
if (unlink(vfdP->fileName))
elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
/* and last report the stat results */
if (stat_errno == 0)
{
pgstat_report_tempfile(filestats.st_size);
/* a negative log_temp_files suppresses the log entry entirely */
if (log_temp_files >= 0)
{
/* st_size is in bytes; it is compared to log_temp_files in units of 1024 bytes */
if ((filestats.st_size / 1024) >= log_temp_files)
ereport(LOG,
(errmsg("temporary file: path \"%s\", size %lu",
vfdP->fileName,
(unsigned long) filestats.st_size)));
}
}
else
{
/* restore stat()'s errno so that %m describes the stat failure */
errno = stat_errno;
elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
}
}
/* Unregister it from the resource owner */
if (vfdP->resowner)
ResourceOwnerForgetFile(vfdP->resowner, file);
/*
 * Return the Vfd slot to the free list
 */
FreeVfd(file);
}
/*
 * FilePrefetch - initiate asynchronous read of a given range of the file.
 * The logical seek position is unaffected.
 *
 * The only implementation at present uses posix_fadvise, the simplest
 * standardized interface that accomplishes this.  A libaio-based
 * implementation could be added in the future, but this API is a poor fit
 * for libaio, which wants to be given a buffer to read into.
 *
 * Returns the negative FileAccess() result if the file could not be
 * re-opened, otherwise the result of posix_fadvise().  On builds without
 * posix_fadvise/POSIX_FADV_WILLNEED this does nothing and returns 0.
 */
int
FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
{
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
	int			rc;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
			   file, VfdCache[file].fileName,
			   (int64) offset, amount));

	rc = FileAccess(file);
	if (rc >= 0)
	{
		/* The advise call is bracketed by wait-event reporting. */
		pgstat_report_wait_start(wait_event_info);
		rc = posix_fadvise(VfdCache[file].fd, offset, amount,
						   POSIX_FADV_WILLNEED);
		pgstat_report_wait_end();
	}

	return rc;
#else
	Assert(FileIsValid(file));
	return 0;
#endif
}
/*
 * FileWriteback - hand a byte range of the file to pg_flush_data().
 *
 * A request with nbytes <= 0 is a no-op, and so is a request on a file that
 * cannot be re-opened; no status is returned to the caller.
 */
void
FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
{
	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
			   file, VfdCache[file].fileName,
			   (int64) offset, (int64) nbytes));

	/*
	 * Caution: pg_flush_data must not be called with nbytes = 0, since that
	 * could trash the file's seek position.  We prefer to define that case
	 * as a no-op here.
	 */
	if (nbytes <= 0)
		return;

	/* Silently give up if the file can't be made accessible. */
	if (FileAccess(file) < 0)
		return;

	pgstat_report_wait_start(wait_event_info);
	pg_flush_data(VfdCache[file].fd, offset, nbytes);
	pgstat_report_wait_end();
}
/*
 * FileRead - read up to "amount" bytes from the file's kernel descriptor
 * into "buffer".
 *
 * Returns the negative FileAccess() result if the file could not be
 * re-opened; otherwise the result of read(), i.e. the number of bytes read
 * or a negative value on failure.  On success the VFD's tracked seek
 * position is advanced by the number of bytes read (unless it was already
 * unknown); on failure it is set to FileUnknownPos.  A read that fails with
 * EINTR is retried.
 */
int
FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
{
int returnCode;
Vfd *vfdP;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
file, VfdCache[file].fileName,
(int64) VfdCache[file].seekPos,
amount, buffer));
/* make sure the VFD has an open kernel descriptor */
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
vfdP = &VfdCache[file];
retry:
/* the read itself is bracketed by wait-event reporting */
pgstat_report_wait_start(wait_event_info);
returnCode = read(vfdP->fd, buffer, amount);
pgstat_report_wait_end();
if (returnCode >= 0)
{
/* if seekPos is unknown, leave it that way */
if (!FilePosIsUnknown(vfdP->seekPos))
vfdP->seekPos += returnCode;
}
else
{
/*
 * Windows may run out of kernel buffers and return "Insufficient
 * system resources" error.  Wait a bit and retry to solve it.
 *
 * It is rumored that EINTR is also possible on some Unix filesystems,
 * in which case immediate retry is indicated.
 */
#ifdef WIN32
DWORD error = GetLastError();
switch (error)
{
case ERROR_NO_SYSTEM_RESOURCES:
/* sleep briefly, then use EINTR to route into the retry below */
pg_usleep(1000L);
errno = EINTR;
break;
default:
_dosmaperr(error);
break;
}
#endif
/* OK to retry if interrupted */
if (errno == EINTR)
goto retry;
/* Trouble, so assume we don't know the file position anymore */
vfdP->seekPos = FileUnknownPos;
}
return returnCode;
}
/*
 * FileWrite - write "amount" bytes from "buffer" to the file's kernel
 * descriptor.
 *
 * Returns the negative FileAccess() result if the file could not be
 * re-opened; otherwise the result of write().  If write() transfers fewer
 * bytes than requested without setting errno, errno is set to ENOSPC.  A
 * write that fails with EINTR is retried.
 *
 * For files flagged FD_TEMPORARY, temp_file_limit is enforced before
 * writing (raising ERROR if the write would exceed it), and fileSize and
 * temporary_files_size are updated after a successful write.
 */
int
FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
{
int returnCode;
Vfd *vfdP;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
file, VfdCache[file].fileName,
(int64) VfdCache[file].seekPos,
amount, buffer));
/* make sure the VFD has an open kernel descriptor */
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
vfdP = &VfdCache[file];
/*
 * If enforcing temp_file_limit and it's a temp file, check to see if the
 * write would overrun temp_file_limit, and throw error if so.  Note: it's
 * really a modularity violation to throw error here; we should set errno
 * and return -1.  However, there's no way to report a suitable error
 * message if we do that.  All current callers would just throw error
 * immediately anyway, so this is safe at present.
 */
if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
{
off_t newPos;
/*
 * Normally we should know the seek position, but if for some reason
 * we have lost track of it, try again to get it.  Here, it's fine to
 * throw an error if we still can't get it.
 */
if (FilePosIsUnknown(vfdP->seekPos))
{
vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
if (FilePosIsUnknown(vfdP->seekPos))
elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
}
/* only a write that extends the file counts against the limit */
newPos = vfdP->seekPos + amount;
if (newPos > vfdP->fileSize)
{
uint64 newTotal = temporary_files_size;
newTotal += newPos - vfdP->fileSize;
/* the limit is scaled by 1024 for comparison with the byte total */
if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
ereport(ERROR,
(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
errmsg("temporary file size exceeds temp_file_limit (%dkB)",
temp_file_limit)));
}
}
retry:
/* clear errno so that a short write which leaves it unset can be detected */
errno = 0;
pgstat_report_wait_start(wait_event_info);
returnCode = write(vfdP->fd, buffer, amount);
pgstat_report_wait_end();
/* if write didn't set errno, assume problem is no disk space */
if (returnCode != amount && errno == 0)
errno = ENOSPC;
if (returnCode >= 0)
{
/* if seekPos is unknown, leave it that way */
if (!FilePosIsUnknown(vfdP->seekPos))
vfdP->seekPos += returnCode;
/*
 * Maintain fileSize and temporary_files_size if it's a temp file.
 *
 * If seekPos is -1 (unknown), this will do nothing; but we could only
 * get here in that state if we're not enforcing temporary_files_size,
 * so we don't care.
 */
if (vfdP->fdstate & FD_TEMPORARY)
{
off_t newPos = vfdP->seekPos;
if (newPos > vfdP->fileSize)
{
temporary_files_size += newPos - vfdP->fileSize;
vfdP->fileSize = newPos;
}
}
}
else
{
/*
 * See comments in FileRead()
 */
#ifdef WIN32
DWORD error = GetLastError();
switch (error)
{
case ERROR_NO_SYSTEM_RESOURCES:
pg_usleep(1000L);
errno = EINTR;
break;
default:
_dosmaperr(error);
break;
}
#endif
/* OK to retry if interrupted */
if (errno == EINTR)
goto retry;
/* Trouble, so assume we don't know the file position anymore */
vfdP->seekPos = FileUnknownPos;
}
return returnCode;
}
/*
 * FileSync - run pg_fsync() on the file's kernel descriptor.
 *
 * Returns the negative FileAccess() result if the file could not be
 * re-opened, otherwise whatever pg_fsync() returns.
 */
int
FileSync(File file, uint32 wait_event_info)
{
	int			rc;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileSync: %d (%s)",
			   file, VfdCache[file].fileName));

	rc = FileAccess(file);
	if (rc >= 0)
	{
		/* The fsync call is bracketed by wait-event reporting. */
		pgstat_report_wait_start(wait_event_info);
		rc = pg_fsync(VfdCache[file].fd);
		pgstat_report_wait_end();
	}

	return rc;
}
/*
 * FileSeek - change the logical seek position of a VFD.
 *
 * While the VFD has no kernel descriptor, SEEK_SET and SEEK_CUR only update
 * the remembered position; SEEK_END has to re-open the file because the
 * position depends on the file's length.  While the file is open, lseek()
 * is invoked unless the request leaves the already-known position
 * unchanged.
 *
 * Returns the resulting seek position.  (off_t) -1 is returned with errno
 * set to EINVAL for a request that would yield a negative position, or for
 * SEEK_CUR on a closed file whose position is unknown; it is also returned
 * if the file has to be re-opened and that fails.  An unrecognized whence
 * value raises an ERROR.
 */
off_t
FileSeek(File file, off_t offset, int whence)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
			   file, VfdCache[file].fileName,
			   (int64) VfdCache[file].seekPos,
			   (int64) offset, whence));

	vfdP = &VfdCache[file];

	switch (whence)
	{
		case SEEK_SET:
			if (offset < 0)
			{
				errno = EINVAL;
				return (off_t) -1;
			}
			if (FileIsNotOpen(file))
				vfdP->seekPos = offset; /* just remember it for re-open */
			else if (vfdP->seekPos != offset)
				vfdP->seekPos = lseek(vfdP->fd, offset, whence);
			break;

		case SEEK_CUR:
			if (FileIsNotOpen(file))
			{
				/* Can only do the arithmetic from a known position. */
				if (FilePosIsUnknown(vfdP->seekPos) ||
					vfdP->seekPos + offset < 0)
				{
					errno = EINVAL;
					return (off_t) -1;
				}
				vfdP->seekPos += offset;
			}
			else if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
				vfdP->seekPos = lseek(vfdP->fd, offset, whence);
			break;

		case SEEK_END:
			/* A closed file must be re-opened before asking the kernel. */
			if (FileIsNotOpen(file) && FileAccess(file) < 0)
				return (off_t) -1;
			vfdP->seekPos = lseek(vfdP->fd, offset, whence);
			break;

		default:
			elog(ERROR, "invalid whence: %d", whence);
			break;
	}

	return vfdP->seekPos;
}
/*
 * FileTell - report the seek position currently recorded for a VFD.
 *
 * XXX not actually used but here for completeness
 */
#ifdef NOT_USED
off_t
FileTell(File file)
{
	off_t		recordedPos;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileTell %d (%s)",
			   file, VfdCache[file].fileName));

	recordedPos = VfdCache[file].seekPos;

	return recordedPos;
}
#endif
/*
 * FileTruncate - ftruncate() the file to the given length.
 *
 * Returns the negative FileAccess() result if the file could not be
 * re-opened, otherwise the result of ftruncate().  When the truncation
 * succeeds and shrinks the size tracked for the file, fileSize and
 * temporary_files_size are reduced to match.
 */
int
FileTruncate(File file, off_t offset, uint32 wait_event_info)
{
	int			rc;
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileTruncate %d (%s)",
			   file, VfdCache[file].fileName));

	rc = FileAccess(file);
	if (rc < 0)
		return rc;

	vfdP = &VfdCache[file];

	pgstat_report_wait_start(wait_event_info);
	rc = ftruncate(vfdP->fd, offset);
	pgstat_report_wait_end();

	/* No bookkeeping unless we actually shortened the tracked size. */
	if (rc != 0 || vfdP->fileSize <= offset)
		return rc;

	/* adjust our state for truncation of a temp file */
	Assert(vfdP->fdstate & FD_TEMPORARY);
	temporary_files_size -= vfdP->fileSize - offset;
	vfdP->fileSize = offset;

	return rc;
}
/*
 * FilePathName
 *
 * Return the pathname associated with an open file.
 *
 * The result points at the VFD's own copy of the name, which stays valid
 * until the file is closed.
 */
char *
FilePathName(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	vfdP = &VfdCache[file];

	return vfdP->fileName;
}
/*
 * FileGetRawDesc
 *
 * Return the raw file descriptor of an opened file.
 *
 * The descriptor remains valid until the file is closed, but a lot of
 * things can make that happen.  The caller should therefore avoid doing
 * much of anything else before it is finished with the returned descriptor.
 */
int
FileGetRawDesc(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	vfdP = &VfdCache[file];

	return vfdP->fd;
}
/*
 * FileGetRawFlags - returns the open(2) flags stored in the file's VFD
 */
int
FileGetRawFlags(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	vfdP = &VfdCache[file];

	return vfdP->fileFlags;
}
/*
 * FileGetRawMode - returns the mode bitmask stored in the file's VFD, i.e.
 * the one passed to open(2)
 */
mode_t
FileGetRawMode(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	vfdP = &VfdCache[file];

	return vfdP->fileMode;
}
/*
 * reserveAllocatedDesc
 *
 * Make room for another allocatedDescs[] array entry if needed and
 * possible.  Returns true if an array element is available, false if the
 * array is full and cannot be enlarged.  Failure of the very first
 * allocation is reported as an out-of-memory ERROR instead.
 */
static bool
reserveAllocatedDesc(void)
{
	AllocateDesc *grown;
	int			limit;

	/* Quick out if array already has a free slot. */
	if (numAllocatedDescs < maxAllocatedDescs)
		return true;

	/*
	 * First use in this process: create the array with FD_MINFREE / 2
	 * elements, which in many scenarios is all we will ever need.
	 * max_safe_fds is not consulted here because set_max_safe_fds() may
	 * not have run yet.
	 */
	if (allocatedDescs == NULL)
	{
		limit = FD_MINFREE / 2;
		grown = (AllocateDesc *) malloc(limit * sizeof(AllocateDesc));

		/* Out of memory already?  Treat as fatal error. */
		if (grown == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));

		allocatedDescs = grown;
		maxAllocatedDescs = limit;
		return true;
	}

	/*
	 * Consider growing beyond the initial allocation.  By now max_safe_fds
	 * should be known accurately.
	 *
	 * Allocated descriptors must not hog all the available FDs, and in
	 * practice a reasonable number has to remain for VFD use, so the cap is
	 * max_safe_fds / 2.  (That should certainly be at least as large as the
	 * initial size, FD_MINFREE / 2.)
	 */
	limit = max_safe_fds / 2;
	if (limit <= maxAllocatedDescs)
		return false;			/* can't enlarge allocatedDescs[] any more */

	grown = (AllocateDesc *) realloc(allocatedDescs,
									 limit * sizeof(AllocateDesc));

	/* Treat out-of-memory as a non-fatal error. */
	if (grown == NULL)
		return false;

	allocatedDescs = grown;
	maxAllocatedDescs = limit;
	return true;
}
/*
 * AllocateFile
 *
 * Routines that want to use stdio (ie, FILE*) should call this rather than
 * plain fopen(), so that fd.c can free up FDs if that is necessary to open
 * the file.  When done, call FreeFile rather than fclose.
 *
 * Files that will stay open for any significant length of time should NOT
 * be handled this way: they cannot share kernel file descriptors with other
 * files, so there is grave risk of running out of FDs if anyone locks down
 * too many of them.  Most callers simply read a config file and close it
 * again immediately.
 *
 * fd.c automatically closes all files opened with AllocateFile at
 * transaction commit or abort; this prevents FD leakage if a routine that
 * calls AllocateFile is terminated prematurely by ereport(ERROR).
 *
 * Ideally this should be the *only* direct call of fopen() in the backend.
 *
 * Returns the stream, or NULL with errno set if fopen() failed.  Having no
 * room for another allocated descriptor is reported with ereport(ERROR).
 */
FILE *
AllocateFile(const char *name, const char *mode)
{
	FILE	   *file;

	DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
			   numAllocatedDescs, name));

	/* Can we allocate another non-virtual FD? */
	if (!reserveAllocatedDesc())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
						maxAllocatedDescs, name)));

	/* Close excess kernel FDs. */
	ReleaseLruFiles();

	/* Keep trying for as long as closing an LRU file frees up a descriptor. */
	for (;;)
	{
		int			save_errno;

		file = fopen(name, mode);
		if (file != NULL)
		{
			AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];

			desc->kind = AllocateDescFile;
			desc->desc.file = file;
			desc->create_subid = GetCurrentSubTransactionId();
			numAllocatedDescs++;
			return desc->desc.file;
		}

		/* Only descriptor exhaustion is worth a retry. */
		if (errno != EMFILE && errno != ENFILE)
			break;

		save_errno = errno;
		ereport(LOG,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("out of file descriptors: %m; release and retry")));
		errno = 0;
		if (!ReleaseLruFile())
		{
			/* Nothing left to close; report the original failure. */
			errno = save_errno;
			break;
		}
	}

	return NULL;
}
/*
 * OpenTransientFile
 *
 * Convenience wrapper around OpenTransientFilePerm() that supplies the
 * default file mode (PG_FILE_MODE_DEFAULT) as the fileMode argument.
 */
int
OpenTransientFile(const char *fileName, int fileFlags)
{
	const mode_t defaultMode = PG_FILE_MODE_DEFAULT;

	return OpenTransientFilePerm(fileName, fileFlags, defaultMode);
}
/*
 * OpenTransientFilePerm
 *
 * Like AllocateFile, but returns an unbuffered fd like open(2).
 *
 * Returns the descriptor, or -1 if the file could not be opened.  Having no
 * room for another allocated descriptor is reported with ereport(ERROR).
 */
int
OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
{
	AllocateDesc *desc;
	int			fd;

	DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
			   numAllocatedDescs, fileName));

	/* Can we allocate another non-virtual FD? */
	if (!reserveAllocatedDesc())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
						maxAllocatedDescs, fileName)));

	/* Close excess kernel FDs. */
	ReleaseLruFiles();

	fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
	if (fd < 0)
		return -1;				/* failure */

	/* Record the descriptor in the slot reserved above. */
	desc = &allocatedDescs[numAllocatedDescs];
	desc->kind = AllocateDescRawFD;
	desc->desc.fd = fd;
	desc->create_subid = GetCurrentSubTransactionId();
	numAllocatedDescs++;

	return fd;
}
/*
 * OpenPipeStream
 *
 * Routines that want to initiate a pipe stream should call this rather
 * than plain popen(), so that fd.c can free up FDs if necessary.  When
 * done, call ClosePipeStream rather than pclose.
 *
 * Returns the stream, or NULL with errno set if popen() failed.  Having no
 * room for another allocated descriptor is reported with ereport(ERROR).
 */
FILE *
OpenPipeStream(const char *command, const char *mode)
{
	FILE	   *file;

	DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
			   numAllocatedDescs, command));

	/* Can we allocate another non-virtual FD? */
	if (!reserveAllocatedDesc())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
						maxAllocatedDescs, command)));

	/* Close excess kernel FDs. */
	ReleaseLruFiles();

	/* Keep trying for as long as closing an LRU file frees up a descriptor. */
	for (;;)
	{
		int			save_errno;

		/* Flush our own stdio output and clear errno before every attempt. */
		fflush(stdout);
		fflush(stderr);
		errno = 0;

		file = popen(command, mode);
		if (file != NULL)
		{
			AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];

			desc->kind = AllocateDescPipe;
			desc->desc.file = file;
			desc->create_subid = GetCurrentSubTransactionId();
			numAllocatedDescs++;
			return desc->desc.file;
		}

		/* Only descriptor exhaustion is worth a retry. */
		if (errno != EMFILE && errno != ENFILE)
			break;

		save_errno = errno;
		ereport(LOG,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("out of file descriptors: %m; release and retry")));
		errno = 0;
		if (!ReleaseLruFile())
		{
			/* Nothing left to close; report the original failure. */
			errno = save_errno;
			break;
		}
	}

	return NULL;
}
/*
 * FreeDesc
 *
 * Free an AllocateDesc of any type: close the underlying object with the
 * call matching its kind, then remove the entry from allocatedDescs[].
 *
 * The argument *must* point into the allocatedDescs[] array.
 *
 * Returns the result of the close call that was used.
 */
static int
FreeDesc(AllocateDesc *desc)
{
	int			closeResult;

	/* Close the underlying object */
	switch (desc->kind)
	{
		case AllocateDescRawFD:
			closeResult = close(desc->desc.fd);
			break;
		case AllocateDescDir:
			closeResult = closedir(desc->desc.dir);
			break;
		case AllocateDescPipe:
			closeResult = pclose(desc->desc.file);
			break;
		case AllocateDescFile:
			closeResult = fclose(desc->desc.file);
			break;
		default:
			elog(ERROR, "AllocateDesc kind not recognized");
			closeResult = 0;	/* keep compiler quiet */
			break;
	}

	/* Compact storage by moving the last entry into the vacated slot. */
	*desc = allocatedDescs[--numAllocatedDescs];

	return closeResult;
}
/*
 * FreeFile
 *
 * Close a file returned by AllocateFile.
 *
 * fclose's return value is not checked here, only passed back --- it is up
 * to the caller to handle close errors.
 */
int
FreeFile(FILE *file)
{
	int			idx;

	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));

	/* Look for the file among the allocated descriptors, newest first. */
	for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
	{
		AllocateDesc *desc = &allocatedDescs[idx];

		if (desc->kind != AllocateDescFile || desc->desc.file != file)
			continue;

		return FreeDesc(desc);
	}

	/* Only get here if someone passes us a file not in allocatedDescs */
	elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");

	return fclose(file);
}
/*
 * CloseTransientFile
 *
 * Close a file returned by OpenTransientFile.
 *
 * close's return value is not checked here, only passed back --- it is up
 * to the caller to handle close errors.
 */
int
CloseTransientFile(int fd)
{
	int			idx;

	DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));

	/* Look for the fd among the allocated descriptors, newest first. */
	for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
	{
		AllocateDesc *desc = &allocatedDescs[idx];

		if (desc->kind != AllocateDescRawFD || desc->desc.fd != fd)
			continue;

		return FreeDesc(desc);
	}

	/* Only get here if someone passes us a file not in allocatedDescs */
	elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");

	return close(fd);
}
/*
 * AllocateDir
 *
 * Routines that want to use <dirent.h> (ie, DIR*) should call this rather
 * than plain opendir(), so that fd.c can free up FDs if that is necessary
 * to open the directory, and can close it again after an elog.  When done,
 * call FreeDir rather than closedir.
 *
 * Ideally this should be the *only* direct call of opendir() in the backend.
 *
 * Returns the directory handle, or NULL with errno set if opendir() failed.
 * Having no room for another allocated descriptor is reported with
 * ereport(ERROR).
 */
DIR *
AllocateDir(const char *dirname)
{
	DIR		   *dir;

	DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
			   numAllocatedDescs, dirname));

	/* Can we allocate another non-virtual FD? */
	if (!reserveAllocatedDesc())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
						maxAllocatedDescs, dirname)));

	/* Close excess kernel FDs. */
	ReleaseLruFiles();

	/* Keep trying for as long as closing an LRU file frees up a descriptor. */
	for (;;)
	{
		int			save_errno;

		dir = opendir(dirname);
		if (dir != NULL)
		{
			AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];

			desc->kind = AllocateDescDir;
			desc->desc.dir = dir;
			desc->create_subid = GetCurrentSubTransactionId();
			numAllocatedDescs++;
			return desc->desc.dir;
		}

		/* Only descriptor exhaustion is worth a retry. */
		if (errno != EMFILE && errno != ENFILE)
			break;

		save_errno = errno;
		ereport(LOG,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("out of file descriptors: %m; release and retry")));
		errno = 0;
		if (!ReleaseLruFile())
		{
			/* Nothing left to close; report the original failure. */
			errno = save_errno;
			break;
		}
	}

	return NULL;
}
/*
* Read a directory opened with AllocateDir, ereport'ing any error.
*
* This is easier to use than raw readdir() since it takes care of some
* otherwise rather tedious and error-prone manipulation of errno. Also,
* if you are happy with a generic error message for AllocateDir failure,
* you can just do
*
* dir = AllocateDir(path);
* while ((dirent = ReadDir(dir, path)) != NULL)
* process dirent;
* FreeDir(dir);
*
* since a NULL dir parameter is taken as indicating AllocateDir failed.
* (Make sure errno hasn't been changed since AllocateDir if you use this
* shortcut.)
*
* The pathname passed to AllocateDir must be passed to this routine too,
* but it is only used for error reporting.
*/
struct dirent *
ReadDir(DIR *dir, const char *dirname)
{
return ReadDirExtended(dir, dirname, ERROR);
}
/*
 * ReadDirExtended
 *
 * Alternate version of ReadDir that lets the caller choose the elevel used
 * for any error report.  If elevel < ERROR, returns NULL on any error.
 * NULL is also returned when readdir() finds no further entry.
 */
static struct dirent *
ReadDirExtended(DIR *dir, const char *dirname, int elevel)
{
	struct dirent *entry;

	/* Give a generic message for AllocateDir failure, if caller didn't */
	if (dir == NULL)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open directory \"%s\": %m",
						dirname)));
		return NULL;
	}

	/*
	 * Clear errno first so that a NULL result caused by an error can be
	 * told apart from one that left errno untouched.
	 */
	errno = 0;
	entry = readdir(dir);

	if (entry == NULL && errno)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not read directory \"%s\": %m",
						dirname)));

	return entry;
}
/*
 * FreeDir
 *
 * Close a directory opened with AllocateDir.
 *
 * closedir's return value is not checked here, only passed back --- it is
 * up to the caller to handle close errors.
 */
int
FreeDir(DIR *dir)
{
	int			idx;

	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));

	/* Look for the dir among the allocated descriptors, newest first. */
	for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
	{
		AllocateDesc *desc = &allocatedDescs[idx];

		if (desc->kind != AllocateDescDir || desc->desc.dir != dir)
			continue;

		return FreeDesc(desc);
	}

	/* Only get here if someone passes us a dir not in allocatedDescs */
	elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");

	return closedir(dir);
}
/*
 * ClosePipeStream
 *
 * Close a pipe stream returned by OpenPipeStream.  The result of the
 * underlying pclose() is passed back to the caller.
 */
int
ClosePipeStream(FILE *file)
{
	int			idx;

	DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));

	/* Look for the stream among the allocated descriptors, newest first. */
	for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
	{
		AllocateDesc *desc = &allocatedDescs[idx];

		if (desc->kind != AllocateDescPipe || desc->desc.file != file)
			continue;

		return FreeDesc(desc);
	}

	/* Only get here if someone passes us a file not in allocatedDescs */
	elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");

	return pclose(file);
}
/*
 * closeAllVfds
 *
 * Put every VFD into the physically-closed state so that as few kernel
 * file descriptors as possible remain in use.  The logical state of the
 * VFDs is left unchanged.
 */
void
closeAllVfds(void)
{
	Index		vfd;

	/* Nothing to do unless SizeVfdCache is positive */
	if (!(SizeVfdCache > 0))
		return;

	Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */

	/* Slot 0 is only sanity-checked above; scanning starts at slot 1 */
	for (vfd = 1; vfd < SizeVfdCache; vfd++)
	{
		if (FileIsNotOpen(vfd))
			continue;

		LruDelete(vfd);
	}
}
/*
 * SetTempTablespaces
 *
 * Install an array of tablespace OIDs to be used for temporary files.  The
 * array stays in effect until end of transaction unless this function is
 * called again first.  The caller must make sure the array lives long
 * enough (typically by allocating it in TopTransactionContext); only the
 * pointer is stored here, not a copy.
 *
 * tableSpaces: array of tablespace OIDs
 * numSpaces:   number of entries in tableSpaces; must not be negative
 */
void
SetTempTablespaces(Oid *tableSpaces, int numSpaces)
{
	Assert(numSpaces >= 0);

	tempTableSpaces = tableSpaces;
	numTempTableSpaces = numSpaces;

	/*
	 * Start at a random position in the list, to reduce conflicts between
	 * backends that most likely share the same list of temp tablespaces.
	 * Later temp files in the same transaction advance circularly through
	 * the list, which spreads large temporary sort files across all the
	 * available tablespaces.  With zero or one entry there is nothing to
	 * randomize.
	 */
	nextTempTableSpace = (numSpaces > 1) ? random() % numSpaces : 0;
}
/*
 * TempTablespacesAreSet
 *
 * Reports whether SetTempTablespaces has been called in the current
 * transaction.  (This exists so that tablespaces.c doesn't need its own
 * per-transaction state.)
 */
bool
TempTablespacesAreSet(void)
{
	/* A negative count means "not set"; AtEOXact_Files resets it to -1 */
	if (numTempTableSpaces < 0)
		return false;

	return true;
}
/*
 * GetNextTempTableSpace
 *
 * Pick the next temp tablespace to use.  InvalidOid is returned when no
 * temp tablespaces are available, meaning the current database's default
 * tablespace should be used.
 */
Oid
GetNextTempTableSpace(void)
{
	/* Empty or unset list: fall back to the default tablespace */
	if (numTempTableSpaces <= 0)
		return InvalidOid;

	/* Step to the following entry, wrapping around at the end of the list */
	nextTempTableSpace++;
	if (nextTempTableSpace >= numTempTableSpaces)
		nextTempTableSpace = 0;

	return tempTableSpaces[nextTempTableSpace];
}
/*
 * AtEOSubXact_Files
 *
 * Subtransaction commit/abort processing.  On abort, the allocated
 * descriptors created by the subtransaction are released.  On commit, they
 * are reassigned to the parent subtransaction.
 *
 * isCommit:    true for subtransaction commit, false for abort
 * mySubid:     id of the subtransaction that is ending
 * parentSubid: id of its parent, which inherits the descriptors on commit
 */
void
AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
				  SubTransactionId parentSubid)
{
	Index		idx = 0;

	while (idx < numAllocatedDescs)
	{
		/* Entries belonging to other subtransactions are left alone */
		if (allocatedDescs[idx].create_subid != mySubid)
		{
			idx++;
			continue;
		}

		if (isCommit)
		{
			allocatedDescs[idx].create_subid = parentSubid;
			idx++;
		}
		else
		{
			/*
			 * The same slot has to be rechecked after FreeDesc, so do not
			 * advance idx here.
			 */
			FreeDesc(&allocatedDescs[idx]);
		}
	}
}
/*
 * AtEOXact_Files
 *
 * Called during transaction commit or abort (it doesn't particularly care
 * which).  Any per-transaction temporary file VFDs that are still open get
 * closed, which also deletes the underlying files (though ResourceOwner
 * cleanup should have closed them already).  All "allocated" stdio files
 * are closed as well, and the transaction-local temp tablespace list is
 * forgotten.
 */
void
AtEOXact_Files(void)
{
	/* false: not a process exit, so only transaction-local cleanup */
	CleanupTempFiles(false);

	/* Drop the tablespace list; -1 marks it as "not set" */
	tempTableSpaces = NULL;
	numTempTableSpaces = -1;
}
/*
 * AtProcExit_Files
 *
 * on_proc_exit hook that cleans up temp files during backend shutdown.
 * At this point *all* temp files must go, including interXact ones.
 *
 * Neither argument is used.
 */
static void
AtProcExit_Files(int code, Datum arg)
{
	/* true: process exit, so remove every temporary file */
	CleanupTempFiles(true);
}
/*
 * Close temporary files and delete their underlying files.
 *
 * isProcExit: true when called while the backend process is exiting, in
 * which case every temporary file is removed.  Otherwise we are being
 * called for transaction commit/abort and only transaction-local temp
 * files are removed.  Either way, the "allocated" stdio files, dirs and
 * fds are cleaned up too.
 */
static void
CleanupTempFiles(bool isProcExit)
{
	/*
	 * Careful here: at proc_exit we need extra cleanup, not just
	 * xact_temporary files.
	 */
	if (isProcExit || have_xact_temporary_files)
	{
		Index		vfd;

		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */

		for (vfd = 1; vfd < SizeVfdCache; vfd++)
		{
			unsigned short state = VfdCache[vfd].fdstate;

			/* Only temporary-file slots that have a file name matter */
			if (!(state & FD_TEMPORARY) || VfdCache[vfd].fileName == NULL)
				continue;

			/*
			 * At backend exit every temporary file is closed.  Otherwise
			 * only files local to the current transaction are closed; the
			 * ResourceOwner mechanism should have closed those already, so
			 * finding one here is just a debugging cross-check.
			 */
			if (!isProcExit)
			{
				if (!(state & FD_XACT_TEMPORARY))
					continue;

				elog(WARNING,
					 "temporary file %s not closed at end-of-transaction",
					 VfdCache[vfd].fileName);
			}

			FileClose(vfd);
		}

		have_xact_temporary_files = false;
	}

	/* Clean up "allocated" stdio files, dirs and fds. */
	while (numAllocatedDescs > 0)
		FreeDesc(&allocatedDescs[0]);
}
/*
 * Remove temporary files and temporary relation files left over from a
 * prior postmaster session.
 *
 * Meant to be called during postmaster startup.  It forcibly removes any
 * leftover files created by OpenTemporaryFile and any leftover temporary
 * relation files created by mdcreate.
 *
 * NOTE: we could, but don't, call this during a post-backend-crash restart
 * cycle.  The argument against is that someone might want to examine the
 * temp files for debugging purposes.  The consequence is that
 * OpenTemporaryFile had better allow for collision with an existing temp
 * file name.
 */
void
RemovePgTempFiles(void)
{
	char		path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
	DIR		   *tblspc_dir;
	struct dirent *entry;

	/* Start with pg_default, which lives under $PGDATA/base */
	snprintf(path, sizeof(path), "base/%s", PG_TEMP_FILES_DIR);
	RemovePgTempFilesInDir(path);
	RemovePgTempRelationFiles("base");

	/* Then visit the temp directories of all non-default tablespaces */
	tblspc_dir = AllocateDir("pg_tblspc");

	for (;;)
	{
		entry = ReadDir(tblspc_dir, "pg_tblspc");
		if (entry == NULL)
			break;

		if (strcmp(entry->d_name, ".") == 0 ||
			strcmp(entry->d_name, "..") == 0)
			continue;

		/* Temp files under this tablespace's version directory ... */
		snprintf(path, sizeof(path), "pg_tblspc/%s/%s/%s",
				 entry->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
		RemovePgTempFilesInDir(path);

		/* ... followed by temp relation files in the same version directory */
		snprintf(path, sizeof(path), "pg_tblspc/%s/%s",
				 entry->d_name, TABLESPACE_VERSION_DIRECTORY);
		RemovePgTempRelationFiles(path);
	}

	FreeDir(tblspc_dir);

	/*
	 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
	 * DataDir as well.
	 */
#ifdef EXEC_BACKEND
	RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
#endif
}
/*
 * Process one pgsql_tmp directory for RemovePgTempFiles.
 *
 * Entries whose names start with PG_TEMP_FILE_PREFIX are unlinked; any
 * other entry (apart from "." and "..") is only reported in the log.
 * A missing directory is silently accepted.
 */
static void
RemovePgTempFilesInDir(const char *tmpdirname)
{
	DIR		   *dir;
	struct dirent *entry;
	char		victim[MAXPGPATH * 2];

	dir = AllocateDir(tmpdirname);
	if (dir == NULL)
	{
		/* anything except ENOENT is fishy */
		if (errno != ENOENT)
			elog(LOG,
				 "could not open temporary-files directory \"%s\": %m",
				 tmpdirname);
		return;
	}

	for (;;)
	{
		entry = ReadDir(dir, tmpdirname);
		if (entry == NULL)
			break;

		if (strcmp(entry->d_name, ".") == 0 ||
			strcmp(entry->d_name, "..") == 0)
			continue;

		snprintf(victim, sizeof(victim), "%s/%s",
				 tmpdirname, entry->d_name);

		/* Names without the temp-file prefix are logged, not removed */
		if (strncmp(entry->d_name,
					PG_TEMP_FILE_PREFIX,
					strlen(PG_TEMP_FILE_PREFIX)) != 0)
		{
			elog(LOG,
				 "unexpected file found in temporary-files directory: \"%s\"",
				 victim);
			continue;
		}

		unlink(victim);			/* note we ignore any error */
	}

	FreeDir(dir);
}
/*
 * Process one tablespace directory, looking for per-database
 * subdirectories and cleaning temp relation files out of each one.
 * A missing tablespace directory is silently accepted.
 */
static void
RemovePgTempRelationFiles(const char *tsdirname)
{
	DIR		   *dir;
	struct dirent *entry;
	char		dbdir[MAXPGPATH * 2];

	dir = AllocateDir(tsdirname);
	if (dir == NULL)
	{
		/* anything except ENOENT is fishy */
		if (errno != ENOENT)
			elog(LOG,
				 "could not open tablespace directory \"%s\": %m",
				 tsdirname);
		return;
	}

	for (;;)
	{
		const char *scan;

		entry = ReadDir(dir, tsdirname);
		if (entry == NULL)
			break;

		/*
		 * Only the per-database directories are of interest, and those have
		 * purely numeric names.  This test also (properly) skips "." and
		 * "..".
		 */
		scan = entry->d_name;
		while (isdigit((unsigned char) *scan))
			scan++;
		if (*scan != '\0' || scan == entry->d_name)
			continue;

		snprintf(dbdir, sizeof(dbdir), "%s/%s",
				 tsdirname, entry->d_name);
		RemovePgTempRelationFilesInDbspace(dbdir);
	}

	FreeDir(dir);
}
/*
 * Process one per-dbspace directory for RemovePgTempRelationFiles: unlink
 * every entry whose name looks like a temporary relation file.
 */
static void
RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
{
	DIR		   *dir;
	struct dirent *entry;
	char		victim[MAXPGPATH * 2];

	dir = AllocateDir(dbspacedirname);
	if (dir == NULL)
	{
		/* we just saw this directory, so it really ought to be there */
		elog(LOG,
			 "could not open dbspace directory \"%s\": %m",
			 dbspacedirname);
		return;
	}

	for (;;)
	{
		entry = ReadDir(dir, dbspacedirname);
		if (entry == NULL)
			break;

		if (looks_like_temp_rel_name(entry->d_name))
		{
			snprintf(victim, sizeof(victim), "%s/%s",
					 dbspacedirname, entry->d_name);
			unlink(victim);		/* note we ignore any error */
		}
	}

	FreeDir(dir);
}
/*
 * Does "name" have the form of a temporary relation file name?
 *
 * Accepted shapes are t<digits>_<digits>, optionally followed by
 * _<forkname>, optionally followed by .<digits> (a segment number).
 */
static bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *start;

	/* Leading "t" is mandatory. */
	if (*cur != 't')
		return false;
	cur++;

	/* First run of digits: must be non-empty and end at an underscore. */
	start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == start || *cur != '_')
		return false;
	cur++;

	/* Second run of digits: must be non-empty as well. */
	start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == start)
		return false;

	/* Optional "_forkname" part; forkname_chars reports how long it is. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional ".segment" part: a dot followed by at least one digit. */
	if (*cur == '.')
	{
		start = cur + 1;
		cur = start;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == start)
			return false;
	}

	/* Anything left over disqualifies the name. */
	return (*cur == '\0');
}
/*
 * Issue fsync recursively on PGDATA and all its contents.
 *
 * Regular files and directories are fsync'd wherever they are, but
 * symlinks are followed only for pg_wal and immediately under pg_tblspc.
 * Other symlinks are presumed to point at files we're not responsible for
 * fsyncing, and might not have privileges to write at all.
 *
 * Errors are logged but not treated as fatal.  This is used only during
 * database startup, to cope with the possibility of issued-but-unsynced
 * writes pending against the data directory; we want those to reach disk
 * before anything done in the new run.  Aborting on error, however, would
 * make startup fail for harmless cases such as read-only files in the data
 * directory, and that's not good either.
 *
 * Note we assume we're chdir'd into PGDATA to begin with.
 */
void
SyncDataDirectory(void)
{
	/*
	 * Set below if pg_wal is a symlink; in that case it has to be walked
	 * separately, because the first walkdir of "." will ignore it.
	 */
	bool		xlog_is_symlink = false;

	/* We can skip this whole thing if fsync is disabled. */
	if (!enableFsync)
		return;

#ifndef WIN32
	{
		struct stat st;

		if (lstat("pg_wal", &st) >= 0)
		{
			if (S_ISLNK(st.st_mode))
				xlog_is_symlink = true;
		}
		else
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not stat file \"%s\": %m",
							"pg_wal")));
	}
#else
	if (pgwin32_is_junction("pg_wal"))
		xlog_is_symlink = true;
#endif

	/*
	 * If possible, hint to the kernel that we're soon going to fsync the data
	 * directory and its contents.  Errors in this step are even less
	 * interesting than normal, so log them only at DEBUG1.
	 */
#ifdef PG_FLUSH_DATA_WORKS
	walkdir(".", pre_sync_fname, false, DEBUG1);
	if (xlog_is_symlink)
		walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
	walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
#endif

	/*
	 * Now do the fsync()s, in the same order as the hints.
	 *
	 * The walk of "." ignores symlinks, so besides the special handling of a
	 * symlinked pg_wal, pg_tblspc must be visited separately with
	 * process_symlinks = true.  Any plain directories in pg_tblspc get
	 * fsync'd twice as a result; that's not an expected case, so we don't
	 * worry about optimizing it.
	 */
	walkdir(".", datadir_fsync_fname, false, LOG);
	if (xlog_is_symlink)
		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
}
/*
* walkdir: recursively walk a directory, applying the action to each
* regular file and directory (including the named directory itself).
*
* If process_symlinks is true, the action and recursion are also applied
* to regular files and directories that are pointed to by symlinks in the
* given directory; otherwise symlinks are ignored. Symlinks are always
* ignored in subdirectories, ie we intentionally don't pass down the
* process_symlinks flag to recursive calls.
*
* Errors are reported at level elevel, which might be ERROR or less.
*
* See also walkdir in initdb.c, which is a frontend version of this logic.
*/
static void
walkdir(const char *path,
void (*action) (const char *fname, bool isdir, int elevel),
bool process_symlinks,
int elevel)
{
DIR *dir;
struct dirent *de;
dir = AllocateDir(path);
if (dir == NULL)
{
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", path)));
return;
}
while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
{
char subpath[MAXPGPATH * 2];
struct stat fst;
int sret;
CHECK_FOR_INTERRUPTS();
if (strcmp(de->d_name, ".") == 0 ||
strcmp(de->d_name, "..") == 0)
continue;
snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
if (process_symlinks)
sret = stat(subpath, &fst);
else
sret = lstat(subpath, &fst);
if (sret < 0)
{
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", subpath)));
continue;
}
if (S_ISREG(fst.st_mode))
(*action) (subpath, false, elevel);
else if (S_ISDIR(fst.st_mode))
walkdir(subpath, action, false, elevel);
}
FreeDir(dir); /* we ignore any error here */
/*
* It's important to fsync the destination directory itself as individual
* file fsyncs don't guarantee that the directory entry for the file is
* synced.
*/
(*action) (path, true, elevel);
}
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Directories are skipped.  A failure to open the file with EACCES is
 * ignored; other open failures are logged at the caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* Unreadable files are silently skipped; anything else is reported */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	(void) CloseTransientFile(fd);
}

#endif							/* PG_FLUSH_DATA_WORKS */
/*
 * walkdir action used by SyncDataDirectory: fsync one file or directory.
 * The result of fsync_fname_ext is deliberately discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
/*
 * fsync_fname_ext -- Try to fsync a file or directory
 *
 * fname:       path of the file or directory to sync
 * isdir:       true if fname is a directory
 * ignore_perm: if true, ignore errors upon trying to open unreadable files
 * elevel:      level at which other errors are logged
 *
 * For directories, failures that merely indicate the platform cannot open
 * or fsync a directory are not treated as errors.
 *
 * Returns 0 if the operation succeeded (or was harmlessly skipped),
 * -1 otherwise.
 */
static int
fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
{
	int			fd;
	int			flags;
	int			returncode;

	/*
	 * Some OSs require directories to be opened read-only whereas other
	 * systems don't allow us to fsync files opened read-only; so we need both
	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
	 * not writable by our userid, but we assume that's OK.
	 */
	flags = PG_BINARY;
	if (!isdir)
		flags |= O_RDWR;
	else
		flags |= O_RDONLY;

	fd = OpenTransientFile(fname, flags);

	/*
	 * Some OSs don't allow us to open directories at all (Windows returns
	 * EACCES), just ignore the error in that case.  If desired, also
	 * silently ignore errors about unreadable files.  Log others.
	 */
	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
		return 0;
	else if (fd < 0 && ignore_perm && errno == EACCES)
		return 0;
	else if (fd < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return -1;
	}

	returncode = pg_fsync(fd);

	/*
	 * Some OSes don't allow us to fsync directories at all, so we can ignore
	 * those errors.  Besides EBADF, such a refusal may be reported as
	 * EINVAL, which POSIX defines as "this operation is not possible on
	 * this file".  Anything else needs to be logged.
	 */
	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
	{
		int			save_errno;

		/* close file upon error, might not be in transaction context */
		save_errno = errno;
		(void) CloseTransientFile(fd);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", fname)));
		return -1;
	}

	(void) CloseTransientFile(fd);

	return 0;
}
/*
 * fsync_parent_path -- fsync the parent path of a file or directory
 *
 * The purpose is to make file operations persistent on disk in case of an
 * OS crash or power failure.
 *
 * fname:  path whose parent directory should be synced
 * elevel: level at which errors are logged
 *
 * Returns 0 on success, -1 if syncing the parent directory failed.
 */
static int
fsync_parent_path(const char *fname, int elevel)
{
	char		parent[MAXPGPATH];
	int			rc;

	strlcpy(parent, fname, MAXPGPATH);
	get_parent_directory(parent);

	/*
	 * get_parent_directory() leaves an empty string when its input is just
	 * a file name (see comments in path.c); treat that as the current
	 * directory.
	 */
	if (parent[0] == '\0')
		strlcpy(parent, ".", MAXPGPATH);

	rc = fsync_fname_ext(parent, true, false, elevel);

	return (rc != 0) ? -1 : 0;
}