postgresql/src/backend/storage/file/fd.c

1329 lines
32 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* fd.c
* Virtual file descriptor code.
*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.87 2001/11/05 17:46:27 momjian Exp $
*
* NOTES:
*
* This code manages a cache of 'virtual' file descriptors (VFDs).
* The server opens many file descriptors for a variety of reasons,
* including base tables, scratch files (e.g., sort and hash spool
* files), and random calls to C library routines like system(3); it
* is quite easy to exceed system limits on the number of open files a
* single process can have. (This is around 256 on many modern
* operating systems, but can be as low as 32 on others.)
*
* VFDs are managed as an LRU pool, with actual OS file descriptors
* being opened and closed as needed. Obviously, if a routine is
* opened using these interfaces, all subsequent operations must also
* be through these interfaces (the File type is not a real file
* descriptor).
*
* For this scheme to work, most (if not all) routines throughout the
* server should use these interfaces instead of calling the C library
* routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
* may find ourselves short of real file descriptors anyway.
*
* This file used to contain a bunch of stuff to support RAID levels 0
* (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone
* because the parallel query processing code that called it is all
* gone. If you really need it you could get it from the original
* POSTGRES source.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <sys/types.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <dirent.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
1999-07-16 05:14:30 +02:00
#include "miscadmin.h"
1996-11-08 07:02:30 +01:00
#include "storage/fd.h"
/* Filename components for OpenTemporaryFile */
#define PG_TEMP_FILES_DIR "pgsql_tmp"
#define PG_TEMP_FILE_PREFIX "pgsql_tmp"
/*
* Problem: Postgres does a system(ld...) to do dynamic loading.
* This will open several extra files in addition to those used by
* Postgres. We need to guarantee that there are file descriptors free
* for ld to use.
*
* The current solution is to limit the number of file descriptors
* that this code will allocate at one time: it leaves RESERVE_FOR_LD free.
*
* (Even though most dynamic loaders now use dlopen(3) or the
* equivalent, the OS must still open several files to perform the
* dynamic loading. And stdin/stdout/stderr count too. Keep this here.)
*/
#ifndef RESERVE_FOR_LD
#define RESERVE_FOR_LD 10
#endif
/*
* We need to ensure that we have at least some file descriptors
* available to postgreSQL after we've reserved the ones for LD,
* so we set that value here.
*
* I think 10 is an appropriate value so that's what it'll be
* for now.
*/
#ifndef FD_MINFREE
#define FD_MINFREE 10
#endif
/*
* A number of platforms return values for sysconf(_SC_OPEN_MAX) that are
* far beyond what they can really support. This GUC parameter limits what
* we will believe.
*/
int max_files_per_process = 1000;
/* Debugging.... */
#ifdef FDDEBUG
#define DO_DB(A) A
#else
#define DO_DB(A) /* A */
#endif
#define VFD_CLOSED (-1)
#define FileIsValid(file) \
((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
#define FileUnknownPos (-1)
typedef struct vfd
{
signed short fd; /* current FD, or VFD_CLOSED if none */
unsigned short fdstate; /* bitflags for VFD's state */
/* these are the assigned bits in fdstate: */
#define FD_DIRTY (1 << 0) /* written to, but not yet fsync'd */
#define FD_TEMPORARY (1 << 1) /* should be unlinked when closed */
File nextFree; /* link to next free VFD, if in freelist */
File lruMoreRecently; /* doubly linked recency-of-use list */
File lruLessRecently;
long seekPos; /* current logical file position */
char *fileName; /* name of file, or NULL for unused VFD */
/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
int fileFlags; /* open(2) flags for (re)opening the file */
int fileMode; /* mode to pass to open(2) */
} Vfd;
/*
* Virtual File Descriptor array pointer and size. This grows as
1999-05-25 18:15:34 +02:00
* needed. 'File' values are indexes into this array.
* Note that VfdCache[0] is not a usable VFD, just a list header.
*/
static Vfd *VfdCache;
static Size SizeVfdCache = 0;
/*
* Number of file descriptors known to be in use by VFD entries.
*/
static int nfile = 0;
/*
* List of stdio FILEs opened with AllocateFile.
*
* Since we don't want to encourage heavy use of AllocateFile, it seems
* OK to put a pretty small maximum limit on the number of simultaneously
* allocated files.
*/
#define MAX_ALLOCATED_FILES 32
static int numAllocatedFiles = 0;
1999-05-25 18:15:34 +02:00
static FILE *allocatedFiles[MAX_ALLOCATED_FILES];
/*
* Number of temporary files opened during the current transaction;
* this is used in generation of tempfile names.
*/
static long tempFileCounter = 0;
/*--------------------
*
* Private Routines
*
* Delete - delete a file from the Lru ring
* LruDelete - remove a file from the Lru ring and close its FD
* Insert - put a file at the front of the Lru ring
* LruInsert - put a file at the front of the Lru ring and open it
* ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
* AllocateVfd - grab a free (or new) file record (from VfdArray)
* FreeVfd - free a file record
*
* The Least Recently Used ring is a doubly linked list that begins and
* ends on element zero. Element zero is special -- it doesn't represent
* a file and its "fd" field always == VFD_CLOSED. Element zero is just an
* anchor that shows us the beginning/end of the ring.
* Only VFD elements that are currently really open (have an FD assigned) are
* in the Lru ring. Elements that are "virtually" open can be recognized
* by having a non-null fileName field.
*
* example:
*
* /--less----\ /---------\
* v \ v \
* #0 --more---> LeastRecentlyUsed --more-\ \
* ^\ | |
* \\less--> MostRecentlyUsedFile <---/ |
* \more---/ \--less--/
*
*--------------------
*/
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int LruInsert(File file);
static bool ReleaseLruFile(void);
static File AllocateVfd(void);
static void FreeVfd(File file);
static int FileAccess(File file);
static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
static char *filepath(const char *filename);
static long pg_nofile(void);
/*
* pg_fsync --- same as fsync except does nothing if enableFsync is off
*/
int
pg_fsync(int fd)
{
if (enableFsync)
return fsync(fd);
else
return 0;
}
/*
* pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
*
* Not all platforms have fdatasync; treat as fsync if not available.
*/
int
pg_fdatasync(int fd)
{
if (enableFsync)
{
#ifdef HAVE_FDATASYNC
return fdatasync(fd);
#else
return fsync(fd);
#endif
}
else
return 0;
}
/*
* BasicOpenFile --- same as open(2) except can free other FDs if needed
*
* This is exported for use by places that really want a plain kernel FD,
* but need to be proof against running out of FDs. Once an FD has been
* successfully returned, it is the caller's responsibility to ensure that
* it will not be leaked on elog()! Most users should *not* call this
* routine directly, but instead use the VFD abstraction level, which
* provides protection against descriptor leaks as well as management of
* files that need to be open for more than a short period of time.
*
* Ideally this should be the *only* direct call of open() in the backend.
* In practice, the postmaster calls open() directly, and there are some
* direct open() calls done early in backend startup. Those are OK since
* this module wouldn't have any open files to close at that point anyway.
*/
int
BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
{
2001-03-22 05:01:46 +01:00
int fd;
tryAgain:
fd = open(fileName, fileFlags, fileMode);
if (fd >= 0)
return fd; /* success! */
if (errno == EMFILE || errno == ENFILE)
{
2001-03-22 05:01:46 +01:00
int save_errno = errno;
DO_DB(elog(DEBUG, "BasicOpenFile: not enough descs, retry, er= %d",
errno));
errno = 0;
if (ReleaseLruFile())
goto tryAgain;
errno = save_errno;
}
return -1; /* failure */
}
/*
* pg_nofile: determine number of filedescriptors that fd.c is allowed to use
*/
static long
pg_nofile(void)
{
static long no_files = 0;
/* need do this calculation only once */
if (no_files == 0)
{
/*
* Ask the system what its files-per-process limit is.
*/
#ifdef HAVE_SYSCONF
no_files = sysconf(_SC_OPEN_MAX);
if (no_files <= 0)
{
#ifdef NOFILE
no_files = (long) NOFILE;
#else
no_files = (long) max_files_per_process;
#endif
elog(DEBUG, "pg_nofile: sysconf(_SC_OPEN_MAX) failed; using %ld",
no_files);
}
#else /* !HAVE_SYSCONF */
#ifdef NOFILE
no_files = (long) NOFILE;
#else
no_files = (long) max_files_per_process;
#endif
#endif /* HAVE_SYSCONF */
/*
* Some platforms return hopelessly optimistic values. Apply a
* configurable upper limit.
*/
if (no_files > (long) max_files_per_process)
no_files = (long) max_files_per_process;
/*
* Make sure we have enough to get by after reserving some for LD.
*/
if ((no_files - RESERVE_FOR_LD) < FD_MINFREE)
elog(FATAL, "pg_nofile: insufficient file descriptors available to start backend.\n"
"\tSystem allows %ld, we need at least %d.",
no_files, RESERVE_FOR_LD + FD_MINFREE);
no_files -= RESERVE_FOR_LD;
}
return no_files;
}
#if defined(FDDEBUG)
static void
_dump_lru(void)
{
int mru = VfdCache[0].lruLessRecently;
Vfd *vfdP = &VfdCache[mru];
char buf[2048];
sprintf(buf, "LRU: MOST %d ", mru);
while (mru != 0)
{
mru = vfdP->lruLessRecently;
vfdP = &VfdCache[mru];
sprintf(buf + strlen(buf), "%d ", mru);
}
sprintf(buf + strlen(buf), "LEAST");
elog(DEBUG, buf);
}
#endif /* FDDEBUG */
static void
Delete(File file)
{
Vfd *vfdP;
Assert(file != 0);
DO_DB(elog(DEBUG, "Delete %d (%s)",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
vfdP = &VfdCache[file];
VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
DO_DB(_dump_lru());
}
static void
LruDelete(File file)
{
Vfd *vfdP;
int returnValue;
Assert(file != 0);
DO_DB(elog(DEBUG, "LruDelete %d (%s)",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
/* delete the vfd record from the LRU ring */
Delete(file);
/* save the seek position */
vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
Assert(vfdP->seekPos != -1);
/* if we have written to the file, sync it before closing */
if (vfdP->fdstate & FD_DIRTY)
{
returnValue = pg_fsync(vfdP->fd);
Assert(returnValue != -1);
vfdP->fdstate &= ~FD_DIRTY;
}
/* close the file */
returnValue = close(vfdP->fd);
Assert(returnValue != -1);
--nfile;
vfdP->fd = VFD_CLOSED;
}
static void
Insert(File file)
{
Vfd *vfdP;
Assert(file != 0);
DO_DB(elog(DEBUG, "Insert %d (%s)",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
vfdP = &VfdCache[file];
vfdP->lruMoreRecently = 0;
vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
VfdCache[0].lruLessRecently = file;
VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
DO_DB(_dump_lru());
}
static int
LruInsert(File file)
{
Vfd *vfdP;
int returnValue;
Assert(file != 0);
DO_DB(elog(DEBUG, "LruInsert %d (%s)",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
if (FileIsNotOpen(file))
{
1999-05-25 18:15:34 +02:00
while (nfile + numAllocatedFiles >= pg_nofile())
{
2001-03-22 05:01:46 +01:00
if (!ReleaseLruFile())
break;
}
/*
* The open could still fail for lack of file descriptors, eg due
* to overall system file table being full. So, be prepared to
* release another FD if necessary...
*/
vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
vfdP->fileMode);
if (vfdP->fd < 0)
{
DO_DB(elog(DEBUG, "RE_OPEN FAILED: %d", errno));
1998-09-01 05:29:17 +02:00
return vfdP->fd;
}
else
{
DO_DB(elog(DEBUG, "RE_OPEN SUCCESS"));
++nfile;
}
/* seek to the right position */
if (vfdP->seekPos != 0L)
{
returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
Assert(returnValue != -1);
}
}
/*
* put it at the head of the Lru ring
*/
Insert(file);
1998-09-01 05:29:17 +02:00
return 0;
}
static bool
ReleaseLruFile(void)
{
DO_DB(elog(DEBUG, "ReleaseLruFile. Opened %d", nfile));
if (nfile > 0)
{
/*
* There are opened files and so there should be at least one used
* vfd in the ring.
*/
Assert(VfdCache[0].lruMoreRecently != 0);
LruDelete(VfdCache[0].lruMoreRecently);
return true; /* freed a file */
}
return false; /* no files available to free */
}
static File
AllocateVfd(void)
{
Index i;
File file;
DO_DB(elog(DEBUG, "AllocateVfd. Size %d", SizeVfdCache));
if (SizeVfdCache == 0)
{
/* initialize header entry first time through */
VfdCache = (Vfd *) malloc(sizeof(Vfd));
if (VfdCache == NULL)
elog(FATAL, "AllocateVfd: no room for VFD array");
MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
VfdCache->fd = VFD_CLOSED;
SizeVfdCache = 1;
/*
* register proc-exit call to ensure temp files are dropped at
* exit
*/
on_proc_exit(AtEOXact_Files, 0);
}
if (VfdCache[0].nextFree == 0)
{
/*
* The free list is empty so it is time to increase the size of
* the array. We choose to double it each time this happens.
* However, there's not much point in starting *real* small.
*/
1999-05-25 18:15:34 +02:00
Size newCacheSize = SizeVfdCache * 2;
Vfd *newVfdCache;
if (newCacheSize < 32)
newCacheSize = 32;
/*
* Be careful not to clobber VfdCache ptr if realloc fails; we
* will need it during proc_exit cleanup!
*/
newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
if (newVfdCache == NULL)
elog(FATAL, "AllocateVfd: no room to enlarge VFD array");
VfdCache = newVfdCache;
/*
* Initialize the new entries and link them into the free list.
*/
for (i = SizeVfdCache; i < newCacheSize; i++)
{
MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
VfdCache[i].nextFree = i + 1;
VfdCache[i].fd = VFD_CLOSED;
}
VfdCache[newCacheSize - 1].nextFree = 0;
VfdCache[0].nextFree = SizeVfdCache;
/*
* Record the new size
*/
SizeVfdCache = newCacheSize;
}
file = VfdCache[0].nextFree;
VfdCache[0].nextFree = VfdCache[file].nextFree;
return file;
}
static void
FreeVfd(File file)
{
Vfd *vfdP = &VfdCache[file];
DO_DB(elog(DEBUG, "FreeVfd: %d (%s)",
file, vfdP->fileName ? vfdP->fileName : ""));
if (vfdP->fileName != NULL)
{
free(vfdP->fileName);
vfdP->fileName = NULL;
}
vfdP->fdstate = 0x0;
vfdP->nextFree = VfdCache[0].nextFree;
VfdCache[0].nextFree = file;
}
1997-11-07 07:38:51 +01:00
/* filepath()
* Convert given pathname to absolute.
*
* Result is a palloc'd string.
*
* (Generally, this isn't actually necessary, considering that we
* should be cd'd into the database directory. Presently it is only
* necessary to do it in "bootstrap" mode. Maybe we should change
* bootstrap mode to do the cd, and save a few cycles/bytes here.)
1997-11-07 07:38:51 +01:00
*/
static char *
filepath(const char *filename)
{
char *buf;
1997-11-07 07:38:51 +01:00
/* Not an absolute path name? Then fill in with database path... */
if (*filename != '/')
{
buf = (char *) palloc(strlen(DatabasePath) + strlen(filename) + 2);
sprintf(buf, "%s/%s", DatabasePath, filename);
}
else
buf = pstrdup(filename);
1997-11-07 07:38:51 +01:00
#ifdef FILEDEBUG
printf("filepath: path is %s\n", buf);
1997-11-07 07:38:51 +01:00
#endif
1998-09-01 05:29:17 +02:00
return buf;
}
static int
FileAccess(File file)
{
int returnValue;
DO_DB(elog(DEBUG, "FileAccess %d (%s)",
file, VfdCache[file].fileName));
/*
1999-05-25 18:15:34 +02:00
* Is the file open? If not, open it and put it at the head of the
* LRU ring (possibly closing the least recently used file to get an
* FD).
*/
if (FileIsNotOpen(file))
{
returnValue = LruInsert(file);
if (returnValue != 0)
return returnValue;
}
else if (VfdCache[0].lruLessRecently != file)
{
/*
* We now know that the file is open and that it is not the last
* one accessed, so we need to move it to the head of the Lru
* ring.
*/
Delete(file);
Insert(file);
}
1998-09-01 05:29:17 +02:00
return 0;
}
/*
* Called when we get a shared invalidation message on some relation.
*/
#ifdef NOT_USED
void
FileInvalidate(File file)
{
Assert(FileIsValid(file));
if (!FileIsNotOpen(file))
LruDelete(file);
}
#endif
static File
fileNameOpenFile(FileName fileName,
int fileFlags,
int fileMode)
{
File file;
Vfd *vfdP;
if (fileName == NULL)
elog(ERROR, "fileNameOpenFile: NULL fname");
DO_DB(elog(DEBUG, "fileNameOpenFile: %s %x %o",
fileName, fileFlags, fileMode));
file = AllocateVfd();
vfdP = &VfdCache[file];
1999-05-25 18:15:34 +02:00
while (nfile + numAllocatedFiles >= pg_nofile())
{
2001-03-22 05:01:46 +01:00
if (!ReleaseLruFile())
break;
}
vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
if (vfdP->fd < 0)
{
FreeVfd(file);
return -1;
}
++nfile;
DO_DB(elog(DEBUG, "fileNameOpenFile: success %d",
vfdP->fd));
Insert(file);
vfdP->fileName = (char *) malloc(strlen(fileName) + 1);
if (vfdP->fileName == NULL)
elog(FATAL, "fileNameOpenFile: no room to save VFD filename");
strcpy(vfdP->fileName, fileName);
/* Saved flags are adjusted to be OK for re-opening file */
vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
vfdP->fileMode = fileMode;
vfdP->seekPos = 0;
2001-03-22 05:01:46 +01:00
2000-11-10 04:53:45 +01:00
/*
2001-03-22 05:01:46 +01:00
* Have to fsync file on commit. Alternative way - log file creation
* and fsync log before actual file creation.
2000-11-10 04:53:45 +01:00
*/
if (fileFlags & O_CREAT)
vfdP->fdstate = FD_DIRTY;
2000-11-23 02:08:57 +01:00
else
vfdP->fdstate = 0x0;
return file;
}
/*
* open a file in the database directory ($PGDATA/base/...)
*/
File
FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
File fd;
char *fname;
fname = filepath(fileName);
fd = fileNameOpenFile(fname, fileFlags, fileMode);
pfree(fname);
1998-09-01 05:29:17 +02:00
return fd;
}
/*
* open a file in an arbitrary directory
*/
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
1998-09-01 05:29:17 +02:00
return fileNameOpenFile(fileName, fileFlags, fileMode);
}
/*
* Open a temporary file that will disappear when we close it.
*
* This routine takes care of generating an appropriate tempfile name.
* There's no need to pass in fileFlags or fileMode either, since only
* one setting makes any sense for a temp file.
*/
File
OpenTemporaryFile(void)
{
char tempfilepath[128];
1999-05-25 18:15:34 +02:00
File file;
1999-05-25 18:15:34 +02:00
/*
* Generate a tempfile name that's unique within the current
* transaction and database instance.
1999-05-25 18:15:34 +02:00
*/
snprintf(tempfilepath, sizeof(tempfilepath),
"%s/%s%d.%ld", PG_TEMP_FILES_DIR, PG_TEMP_FILE_PREFIX,
MyProcPid, tempFileCounter++);
/*
* Open the file. Note: we don't use O_EXCL, in case there is an
* orphaned temp file that can be reused.
*/
file = FileNameOpenFile(tempfilepath,
O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
0600);
if (file <= 0)
{
char *dirpath;
/*
* We might need to create the pg_tempfiles subdirectory, if no
* one has yet done so.
*
* Don't check for error from mkdir; it could fail if someone else
* just did the same thing. If it doesn't work then we'll bomb
* out on the second create attempt, instead.
*/
dirpath = filepath(PG_TEMP_FILES_DIR);
mkdir(dirpath, S_IRWXU);
pfree(dirpath);
file = FileNameOpenFile(tempfilepath,
O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
0600);
if (file <= 0)
elog(ERROR, "Failed to create temporary file %s", tempfilepath);
}
/* Mark it for deletion at close or EOXact */
VfdCache[file].fdstate |= FD_TEMPORARY;
return file;
}
/*
* close a file when done with it
*/
void
FileClose(File file)
{
int returnValue;
Assert(FileIsValid(file));
DO_DB(elog(DEBUG, "FileClose: %d (%s)",
file, VfdCache[file].fileName));
if (!FileIsNotOpen(file))
{
/* remove the file from the lru ring */
Delete(file);
/* if we did any writes, sync the file before closing */
if (VfdCache[file].fdstate & FD_DIRTY)
{
returnValue = pg_fsync(VfdCache[file].fd);
Assert(returnValue != -1);
VfdCache[file].fdstate &= ~FD_DIRTY;
}
/* close the file */
returnValue = close(VfdCache[file].fd);
Assert(returnValue != -1);
--nfile;
VfdCache[file].fd = VFD_CLOSED;
}
/*
* Delete the file if it was temporary
*/
if (VfdCache[file].fdstate & FD_TEMPORARY)
{
/* reset flag so that die() interrupt won't cause problems */
VfdCache[file].fdstate &= ~FD_TEMPORARY;
unlink(VfdCache[file].fileName);
}
/*
* Return the Vfd slot to the free list
*/
FreeVfd(file);
}
/*
* close a file and forcibly delete the underlying Unix file
*/
void
FileUnlink(File file)
{
Assert(FileIsValid(file));
DO_DB(elog(DEBUG, "FileUnlink: %d (%s)",
file, VfdCache[file].fileName));
/* force FileClose to delete it */
VfdCache[file].fdstate |= FD_TEMPORARY;
FileClose(file);
}
int
FileRead(File file, char *buffer, int amount)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(DEBUG, "FileRead: %d (%s) %ld %d %p",
file, VfdCache[file].fileName,
VfdCache[file].seekPos, amount, buffer));
FileAccess(file);
returnCode = read(VfdCache[file].fd, buffer, amount);
if (returnCode > 0)
VfdCache[file].seekPos += returnCode;
else
VfdCache[file].seekPos = FileUnknownPos;
return returnCode;
}
int
FileWrite(File file, char *buffer, int amount)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(DEBUG, "FileWrite: %d (%s) %ld %d %p",
file, VfdCache[file].fileName,
VfdCache[file].seekPos, amount, buffer));
FileAccess(file);
errno = 0;
returnCode = write(VfdCache[file].fd, buffer, amount);
/* if write didn't set errno, assume problem is no disk space */
if (returnCode != amount && errno == 0)
errno = ENOSPC;
if (returnCode > 0)
VfdCache[file].seekPos += returnCode;
else
VfdCache[file].seekPos = FileUnknownPos;
return returnCode;
}
long
FileSeek(File file, long offset, int whence)
{
Assert(FileIsValid(file));
DO_DB(elog(DEBUG, "FileSeek: %d (%s) %ld %ld %d",
file, VfdCache[file].fileName,
VfdCache[file].seekPos, offset, whence));
if (FileIsNotOpen(file))
{
switch (whence)
{
case SEEK_SET:
if (offset < 0)
elog(ERROR, "FileSeek: invalid offset: %ld", offset);
VfdCache[file].seekPos = offset;
break;
case SEEK_CUR:
VfdCache[file].seekPos += offset;
break;
case SEEK_END:
FileAccess(file);
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
break;
default:
elog(ERROR, "FileSeek: invalid whence: %d", whence);
break;
}
}
else
{
switch (whence)
{
case SEEK_SET:
if (offset < 0)
elog(ERROR, "FileSeek: invalid offset: %ld", offset);
if (VfdCache[file].seekPos != offset)
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
break;
case SEEK_CUR:
if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
break;
case SEEK_END:
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
break;
default:
elog(ERROR, "FileSeek: invalid whence: %d", whence);
break;
}
}
return VfdCache[file].seekPos;
}
/*
* XXX not actually used but here for completeness
*/
#ifdef NOT_USED
long
FileTell(File file)
{
Assert(FileIsValid(file));
DO_DB(elog(DEBUG, "FileTell %d (%s)",
file, VfdCache[file].fileName));
return VfdCache[file].seekPos;
}
#endif
int
FileTruncate(File file, long offset)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(DEBUG, "FileTruncate %d (%s)",
file, VfdCache[file].fileName));
FileSync(file);
FileAccess(file);
returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
1998-09-01 05:29:17 +02:00
return returnCode;
}
/*
* FileSync --- if a file is marked as dirty, fsync it.
*
* The FD_DIRTY bit is slightly misnamed: it doesn't mean that we need to
* write the file, but that we *have* written it and need to execute an
* fsync() to ensure the changes are down on disk before we mark the current
* transaction committed.
*
* FD_DIRTY is set by FileWrite or by an explicit FileMarkDirty() call.
* It is cleared after successfully fsync'ing the file. FileClose() will
* fsync a dirty File that is about to be closed, since there will be no
* other place to remember the need to fsync after the VFD is gone.
*
* Note that the DIRTY bit is logically associated with the actual disk file,
* not with any particular kernel FD we might have open for it. We assume
* that fsync will force out any dirty buffers for that file, whether or not
* they were written through the FD being used for the fsync call --- they
* might even have been written by some other backend!
*
* Note also that LruDelete currently fsyncs a dirty file that it is about
* to close the kernel file descriptor for. The idea there is to avoid
* having to re-open the kernel descriptor later. But it's not real clear
* that this is a performance win; we could end up fsyncing the same file
* multiple times in a transaction, which would probably cost more time
* than is saved by avoiding an open() call. This should be studied.
*
* This routine used to think it could skip the fsync if the file is
* physically closed, but that is now WRONG; see comments for FileMarkDirty.
*/
int
FileSync(File file)
{
int returnCode;
Assert(FileIsValid(file));
if (!(VfdCache[file].fdstate & FD_DIRTY))
{
/* Need not sync if file is not dirty. */
returnCode = 0;
}
else if (!enableFsync)
{
/* Don't force the file open if pg_fsync isn't gonna sync it. */
returnCode = 0;
VfdCache[file].fdstate &= ~FD_DIRTY;
}
else
{
/*
* We don't use FileAccess() because we don't want to force the
* file to the front of the LRU ring; we aren't expecting to
* access it again soon.
*/
if (FileIsNotOpen(file))
{
returnCode = LruInsert(file);
if (returnCode != 0)
return returnCode;
}
returnCode = pg_fsync(VfdCache[file].fd);
if (returnCode == 0)
VfdCache[file].fdstate &= ~FD_DIRTY;
}
return returnCode;
}
/*
* FileMarkDirty --- mark a file as needing fsync at transaction commit.
*
* Since FileWrite marks the file dirty, this routine is not needed in
* normal use. It is called when the buffer manager detects that some other
* backend has written out a shared buffer that this backend dirtied (but
* didn't write) in the current xact. In that scenario, we need to fsync
* the file before we can commit. We cannot assume that the other backend
* has fsync'd the file yet; we need to do our own fsync to ensure that
* (a) the disk page is written and (b) this backend's commit is delayed
* until the write is complete.
*
* Note we are assuming that an fsync issued by this backend will write
* kernel disk buffers that were dirtied by another backend. Furthermore,
* it doesn't matter whether we currently have the file physically open;
* we must fsync even if we have to re-open the file to do it.
*/
void
FileMarkDirty(File file)
{
Assert(FileIsValid(file));
DO_DB(elog(DEBUG, "FileMarkDirty: %d (%s)",
file, VfdCache[file].fileName));
VfdCache[file].fdstate |= FD_DIRTY;
}
/*
* Routines that want to use stdio (ie, FILE*) should use AllocateFile
* rather than plain fopen(). This lets fd.c deal with freeing FDs if
1999-05-25 18:15:34 +02:00
* necessary to open the file. When done, call FreeFile rather than fclose.
*
* Note that files that will be open for any significant length of time
* should NOT be handled this way, since they cannot share kernel file
* descriptors with other files; there is grave risk of running out of FDs
* if anyone locks down too many FDs. Most callers of this routine are
* simply reading a config file that they will read and close immediately.
*
* fd.c will automatically close all files opened with AllocateFile at
* transaction commit or abort; this prevents FD leakage if a routine
* that calls AllocateFile is terminated prematurely by elog(ERROR).
*
* Ideally this should be the *only* direct call of fopen() in the backend.
*/
FILE *
AllocateFile(char *name, char *mode)
{
FILE *file;
DO_DB(elog(DEBUG, "AllocateFile: Allocated %d", numAllocatedFiles));
if (numAllocatedFiles >= MAX_ALLOCATED_FILES)
elog(ERROR, "AllocateFile: too many private FDs demanded");
TryAgain:
if ((file = fopen(name, mode)) != NULL)
{
allocatedFiles[numAllocatedFiles] = file;
numAllocatedFiles++;
return file;
}
if (errno == EMFILE || errno == ENFILE)
{
2001-03-22 05:01:46 +01:00
int save_errno = errno;
DO_DB(elog(DEBUG, "AllocateFile: not enough descs, retry, er= %d",
errno));
errno = 0;
if (ReleaseLruFile())
goto TryAgain;
errno = save_errno;
}
return NULL;
}
void
FreeFile(FILE *file)
{
1999-05-25 18:15:34 +02:00
int i;
DO_DB(elog(DEBUG, "FreeFile: Allocated %d", numAllocatedFiles));
/* Remove file from list of allocated files, if it's present */
1999-05-25 18:15:34 +02:00
for (i = numAllocatedFiles; --i >= 0;)
{
if (allocatedFiles[i] == file)
{
numAllocatedFiles--;
allocatedFiles[i] = allocatedFiles[numAllocatedFiles];
break;
}
}
if (i < 0)
elog(NOTICE, "FreeFile: file was not obtained from AllocateFile");
fclose(file);
}
/*
* closeAllVfds
*
* Force all VFDs into the physically-closed state, so that the fewest
* possible number of kernel file descriptors are in use. There is no
* change in the logical state of the VFDs.
*/
void
closeAllVfds(void)
{
Index i;
if (SizeVfdCache > 0)
{
1999-05-25 18:15:34 +02:00
Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
for (i = 1; i < SizeVfdCache; i++)
{
if (!FileIsNotOpen(i))
LruDelete(i);
}
}
}
/*
* AtEOXact_Files
*
* This routine is called during transaction commit or abort or backend
* exit (it doesn't particularly care which). All still-open temporary-file
* VFDs are closed, which also causes the underlying files to be deleted.
* Furthermore, all "allocated" stdio files are closed.
*
* This routine is not involved in fsync'ing non-temporary files at xact
* commit; that is done by FileSync under control of the buffer manager.
* During a commit, that is done *before* control gets here. If we still
* have any needs-fsync bits set when we get here, we assume this is abort
* and clear them.
*/
void
AtEOXact_Files(void)
{
Index i;
if (SizeVfdCache > 0)
{
1999-05-25 18:15:34 +02:00
Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
for (i = 1; i < SizeVfdCache; i++)
{
if ((VfdCache[i].fdstate & FD_TEMPORARY) &&
VfdCache[i].fileName != NULL)
FileClose(i);
else
VfdCache[i].fdstate &= ~FD_DIRTY;
}
}
while (numAllocatedFiles > 0)
FreeFile(allocatedFiles[0]);
1999-05-25 18:15:34 +02:00
/*
* Reset the tempfile name counter to 0; not really necessary, but
* helps keep the names from growing unreasonably long.
*/
tempFileCounter = 0;
}
/*
* Remove old temporary files
*
* This should be called during postmaster startup. It will forcibly
* remove any leftover files created by OpenTemporaryFile.
*/
void
RemovePgTempFiles(void)
{
char db_path[MAXPGPATH];
char temp_path[MAXPGPATH];
char rm_path[MAXPGPATH];
DIR *db_dir;
DIR *temp_dir;
struct dirent *db_de;
struct dirent *temp_de;
/*
* Cycle through pg_tempfiles for all databases and remove old temp
* files.
*/
snprintf(db_path, sizeof(db_path), "%s/base", DataDir);
if ((db_dir = opendir(db_path)) != NULL)
{
while ((db_de = readdir(db_dir)) != NULL)
{
if (strcmp(db_de->d_name, ".") == 0 ||
strcmp(db_de->d_name, "..") == 0)
continue;
snprintf(temp_path, sizeof(temp_path),
"%s/%s/%s",
db_path, db_de->d_name,
PG_TEMP_FILES_DIR);
if ((temp_dir = opendir(temp_path)) != NULL)
{
while ((temp_de = readdir(temp_dir)) != NULL)
{
if (strcmp(temp_de->d_name, ".") == 0 ||
strcmp(temp_de->d_name, "..") == 0)
continue;
snprintf(rm_path, sizeof(temp_path),
"%s/%s/%s/%s",
db_path, db_de->d_name,
PG_TEMP_FILES_DIR,
temp_de->d_name);
if (strncmp(temp_de->d_name,
PG_TEMP_FILE_PREFIX,
strlen(PG_TEMP_FILE_PREFIX)) == 0)
unlink(rm_path);
else
{
/*
* would prefer to use elog here, but it's not up
* and running during postmaster startup...
*/
fprintf(stderr,
"Unexpected file found in temporary-files directory: %s\n",
rm_path);
}
}
closedir(temp_dir);
}
}
closedir(db_dir);
}
}