postgresql/src/backend/storage/file/fd.c

/*-------------------------------------------------------------------------
*
* fd.c
* Virtual file descriptor code.
*
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.152 2010/01/02 16:57:51 momjian Exp $
*
* NOTES:
*
* This code manages a cache of 'virtual' file descriptors (VFDs).
* The server opens many file descriptors for a variety of reasons,
* including base tables, scratch files (e.g., sort and hash spool
* files), and random calls to C library routines like system(3); it
* is quite easy to exceed system limits on the number of open files a
* single process can have. (This is around 256 on many modern
* operating systems, but can be as low as 32 on others.)
*
* VFDs are managed as an LRU pool, with actual OS file descriptors
* being opened and closed as needed. Obviously, if a file is
* opened using these interfaces, all subsequent operations must also
* be through these interfaces (the File type is not a real file
* descriptor).
*
* For this scheme to work, most (if not all) routines throughout the
* server should use these interfaces instead of calling the C library
* routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
* may find ourselves short of real file descriptors anyway.
*
* This file used to contain a bunch of stuff to support RAID levels 0
* (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone
* because the parallel query processing code that called it is all
* gone. If you really need it you could get it from the original
* POSTGRES source.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#ifdef HAVE_SYS_RESOURCE_H
#include <sys/resource.h> /* for getrlimit */
#endif
#include "miscadmin.h"
#include "access/xact.h"
#include "catalog/pg_tablespace.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/resowner.h"
/*
* We must leave some file descriptors free for system(), the dynamic loader,
* and other code that tries to open files without consulting fd.c. This
* is the number left free. (While we can be pretty sure we won't get
* EMFILE, there's never any guarantee that we won't get ENFILE due to
* other processes chewing up FDs. So it's a bad idea to try to open files
* without consulting fd.c. Nonetheless we cannot control all code.)
*
* Because this is just a fixed setting, we are effectively assuming that
* no such code will leave FDs open over the long term; otherwise the slop
* is likely to be insufficient. Note in particular that we expect that
* loading a shared library does not result in any permanent increase in
* the number of open files. (This appears to be true on most if not
* all platforms as of Feb 2004.)
*/
#define NUM_RESERVED_FDS 10
/*
* If we have fewer than this many usable FDs after allowing for the reserved
* ones, choke.
*/
#define FD_MINFREE 10
/*
* A number of platforms allow individual processes to open many more files
* than they can really support when *many* processes do the same thing.
* This GUC parameter lets the DBA limit max_safe_fds to something less than
* what the postmaster's initial probe suggests will work.
*/
int max_files_per_process = 1000;
/*
* Maximum number of file descriptors to open for either VFD entries or
* AllocateFile/AllocateDir operations. This is initialized to a conservative
* value, and remains that way indefinitely in bootstrap or standalone-backend
* cases. In normal postmaster operation, the postmaster calls
* set_max_safe_fds() late in initialization to update the value, and that
* value is then inherited by forked subprocesses.
*
* Note: the value of max_files_per_process is taken into account while
* setting this variable, and so need not be tested separately.
*/
static int max_safe_fds = 32; /* default if not changed */
/* Debugging.... */
#ifdef FDDEBUG
#define DO_DB(A) A
#else
#define DO_DB(A) /* A */
#endif
#define VFD_CLOSED (-1)
#define FileIsValid(file) \
((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
#define FileUnknownPos ((off_t) -1)
/* these are the assigned bits in fdstate below: */
#define FD_TEMPORARY (1 << 0) /* T = delete when closed */
#define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
/*
* Flag to tell whether it's worth scanning VfdCache looking for temp files to
* close
*/
static bool have_xact_temporary_files = false;
typedef struct vfd
{
int fd; /* current FD, or VFD_CLOSED if none */
unsigned short fdstate; /* bitflags for VFD's state */
ResourceOwner resowner; /* owner, for automatic cleanup */
File nextFree; /* link to next free VFD, if in freelist */
File lruMoreRecently; /* doubly linked recency-of-use list */
File lruLessRecently;
off_t seekPos; /* current logical file position */
char *fileName; /* name of file, or NULL for unused VFD */
/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
int fileFlags; /* open(2) flags for (re)opening the file */
int fileMode; /* mode to pass to open(2) */
} Vfd;
/*
* Virtual File Descriptor array pointer and size. This grows as
* needed. 'File' values are indexes into this array.
* Note that VfdCache[0] is not a usable VFD, just a list header.
*/
static Vfd *VfdCache;
static Size SizeVfdCache = 0;
/*
* Number of file descriptors known to be in use by VFD entries.
*/
static int nfile = 0;
/*
* List of stdio FILEs and <dirent.h> DIRs opened with AllocateFile
* and AllocateDir.
*
* Since we don't want to encourage heavy use of AllocateFile or AllocateDir,
* it seems OK to put a pretty small maximum limit on the number of
* simultaneously allocated descs.
*/
#define MAX_ALLOCATED_DESCS 32
typedef enum
{
AllocateDescFile,
AllocateDescDir
} AllocateDescKind;
typedef struct
{
AllocateDescKind kind;
union
{
FILE *file;
DIR *dir;
} desc;
SubTransactionId create_subid;
} AllocateDesc;
static int numAllocatedDescs = 0;
static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS];
/*
* Number of temporary files opened during the current session;
* this is used in generation of tempfile names.
*/
static long tempFileCounter = 0;
/*
* Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
* this has not been set in the current transaction.
*/
static Oid *tempTableSpaces = NULL;
static int numTempTableSpaces = -1;
static int nextTempTableSpace = 0;
/*--------------------
*
* Private Routines
*
* Delete - delete a file from the Lru ring
* LruDelete - remove a file from the Lru ring and close its FD
* Insert - put a file at the front of the Lru ring
* LruInsert - put a file at the front of the Lru ring and open it
* ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
* AllocateVfd - grab a free (or new) file record (from VfdArray)
* FreeVfd - free a file record
*
* The Least Recently Used ring is a doubly linked list that begins and
* ends on element zero. Element zero is special -- it doesn't represent
* a file and its "fd" field always == VFD_CLOSED. Element zero is just an
* anchor that shows us the beginning/end of the ring.
* Only VFD elements that are currently really open (have an FD assigned) are
* in the Lru ring. Elements that are "virtually" open can be recognized
* by having a non-null fileName field.
*
* example:
*
* /--less----\ /---------\
* v \ v \
* #0 --more---> LeastRecentlyUsed --more-\ \
* ^\ | |
* \\less--> MostRecentlyUsedFile <---/ |
* \more---/ \--less--/
*
*--------------------
*/
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int LruInsert(File file);
static bool ReleaseLruFile(void);
static File AllocateVfd(void);
static void FreeVfd(File file);
static int FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname);
/*
* pg_fsync --- do fsync with or without writethrough
*/
int
pg_fsync(int fd)
{
#ifndef HAVE_FSYNC_WRITETHROUGH_ONLY
if (sync_method != SYNC_METHOD_FSYNC_WRITETHROUGH)
return pg_fsync_no_writethrough(fd);
else
#endif
return pg_fsync_writethrough(fd);
}
/*
* pg_fsync_no_writethrough --- same as fsync except does nothing if
* enableFsync is off
*/
int
pg_fsync_no_writethrough(int fd)
{
if (enableFsync)
return fsync(fd);
else
return 0;
}
/*
* pg_fsync_writethrough
*/
int
pg_fsync_writethrough(int fd)
{
if (enableFsync)
{
#ifdef WIN32
return _commit(fd);
#elif defined(F_FULLFSYNC)
return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
#else
return -1;
#endif
}
else
return 0;
}
/*
* pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
*
* Not all platforms have fdatasync; treat as fsync if not available.
*/
int
pg_fdatasync(int fd)
{
if (enableFsync)
{
#ifdef HAVE_FDATASYNC
return fdatasync(fd);
#else
return fsync(fd);
#endif
}
else
return 0;
}
/*
* InitFileAccess --- initialize this module during backend startup
*
* This is called during either normal or standalone backend start.
* It is *not* called in the postmaster.
*/
void
InitFileAccess(void)
{
Assert(SizeVfdCache == 0); /* call me only once */
/* initialize cache header entry */
VfdCache = (Vfd *) malloc(sizeof(Vfd));
if (VfdCache == NULL)
ereport(FATAL,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
VfdCache->fd = VFD_CLOSED;
SizeVfdCache = 1;
/* register proc-exit hook to ensure temp files are dropped at exit */
on_proc_exit(AtProcExit_Files, 0);
}
/*
* count_usable_fds --- count how many FDs the system will let us open,
* and estimate how many are already open.
*
* We stop counting if usable_fds reaches max_to_probe. Note: a small
* value of max_to_probe might result in an underestimate of already_open;
* we must fill in any "gaps" in the set of used FDs before the calculation
* of already_open will give the right answer. In practice, max_to_probe
* of a couple of dozen should be enough to ensure good results.
*
* We assume stdin (FD 0) is available for dup'ing
*/
static void
count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
{
int *fd;
int size;
int used = 0;
int highestfd = 0;
int j;
#ifdef HAVE_GETRLIMIT
struct rlimit rlim;
int getrlimit_status;
#endif
size = 1024;
fd = (int *) palloc(size * sizeof(int));
#ifdef HAVE_GETRLIMIT
#ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
#else /* but BSD doesn't ... */
getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
#endif /* RLIMIT_NOFILE */
if (getrlimit_status != 0)
ereport(WARNING, (errmsg("getrlimit failed: %m")));
#endif /* HAVE_GETRLIMIT */
/* dup until failure or probe limit reached */
for (;;)
{
int thisfd;
#ifdef HAVE_GETRLIMIT
/*
* don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
* some platforms
*/
if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
break;
#endif
thisfd = dup(0);
if (thisfd < 0)
{
/* Expect EMFILE or ENFILE, else it's fishy */
if (errno != EMFILE && errno != ENFILE)
elog(WARNING, "dup(0) failed after %d successes: %m", used);
break;
}
if (used >= size)
{
size *= 2;
fd = (int *) repalloc(fd, size * sizeof(int));
}
fd[used++] = thisfd;
if (highestfd < thisfd)
highestfd = thisfd;
if (used >= max_to_probe)
break;
}
/* release the files we opened */
for (j = 0; j < used; j++)
close(fd[j]);
pfree(fd);
/*
* Return results. usable_fds is just the number of successful dups. We
* assume that the system limit is highestfd+1 (remember 0 is a legal FD
* number) and so already_open is highestfd+1 - usable_fds.
*/
*usable_fds = used;
*already_open = highestfd + 1 - used;
}
/*
* set_max_safe_fds
* Determine number of filedescriptors that fd.c is allowed to use
*/
void
set_max_safe_fds(void)
{
int usable_fds;
int already_open;
/*----------
* We want to set max_safe_fds to
* MIN(usable_fds, max_files_per_process - already_open)
* less the slop factor for files that are opened without consulting
* fd.c. This ensures that we won't exceed either max_files_per_process
* or the experimentally-determined EMFILE limit.
*----------
*/
count_usable_fds(max_files_per_process,
&usable_fds, &already_open);
max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
/*
* Take off the FDs reserved for system() etc.
*/
max_safe_fds -= NUM_RESERVED_FDS;
/*
* Make sure we still have enough to get by.
*/
if (max_safe_fds < FD_MINFREE)
ereport(FATAL,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("insufficient file descriptors available to start server process"),
errdetail("System allows %d, we need at least %d.",
max_safe_fds + NUM_RESERVED_FDS,
FD_MINFREE + NUM_RESERVED_FDS)));
elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
max_safe_fds, usable_fds, already_open);
}
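/*
 * Worked example (hypothetical numbers, not measured on any particular
 * platform): if count_usable_fds() reports usable_fds = 970 and
 * already_open = 30, and max_files_per_process is 1000, then
 * Min(970, 1000 - 30) = 970, and subtracting NUM_RESERVED_FDS (10)
 * leaves max_safe_fds = 960.
 */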
/*
* BasicOpenFile --- same as open(2) except can free other FDs if needed
*
* This is exported for use by places that really want a plain kernel FD,
* but need to be proof against running out of FDs. Once an FD has been
* successfully returned, it is the caller's responsibility to ensure that
* it will not be leaked on ereport()! Most users should *not* call this
* routine directly, but instead use the VFD abstraction level, which
* provides protection against descriptor leaks as well as management of
* files that need to be open for more than a short period of time.
*
* Ideally this should be the *only* direct call of open() in the backend.
* In practice, the postmaster calls open() directly, and there are some
* direct open() calls done early in backend startup. Those are OK since
* this module wouldn't have any open files to close at that point anyway.
*/
int
BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
{
int fd;
tryAgain:
fd = open(fileName, fileFlags, fileMode);
if (fd >= 0)
return fd; /* success! */
if (errno == EMFILE || errno == ENFILE)
{
int save_errno = errno;
ereport(LOG,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("out of file descriptors: %m; release and retry")));
errno = 0;
if (ReleaseLruFile())
goto tryAgain;
errno = save_errno;
}
return -1; /* failure */
}
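/*
 * Illustrative sketch only (not part of this file's real API surface):
 * a caller that truly needs a raw kernel FD pairs BasicOpenFile with
 * plain close(), and may flush with pg_fsync().  The path used here is
 * hypothetical.
 */
#ifdef NOT_USED
static void
example_basic_open_and_sync(void)
{
	int			fd;

	fd = BasicOpenFile("global/hypothetical_file", O_RDWR | PG_BINARY, 0600);
	if (fd < 0)
		elog(ERROR, "could not open file: %m");
	if (pg_fsync(fd) != 0)
		elog(ERROR, "could not fsync file: %m");
	close(fd);
}
#endif   /* NOT_USED */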
#if defined(FDDEBUG)
static void
_dump_lru(void)
{
int mru = VfdCache[0].lruLessRecently;
Vfd *vfdP = &VfdCache[mru];
char buf[2048];
snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
while (mru != 0)
{
mru = vfdP->lruLessRecently;
vfdP = &VfdCache[mru];
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
}
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
elog(LOG, "%s", buf);
}
#endif /* FDDEBUG */
static void
Delete(File file)
{
Vfd *vfdP;
Assert(file != 0);
DO_DB(elog(LOG, "Delete %d (%s)",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
vfdP = &VfdCache[file];
VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
DO_DB(_dump_lru());
}
static void
LruDelete(File file)
{
Vfd *vfdP;
Assert(file != 0);
DO_DB(elog(LOG, "LruDelete %d (%s)",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
/* delete the vfd record from the LRU ring */
Delete(file);
/* save the seek position */
vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
Assert(vfdP->seekPos != (off_t) -1);
/* close the file */
if (close(vfdP->fd))
elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);
--nfile;
vfdP->fd = VFD_CLOSED;
}
static void
Insert(File file)
{
Vfd *vfdP;
Assert(file != 0);
DO_DB(elog(LOG, "Insert %d (%s)",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
vfdP = &VfdCache[file];
vfdP->lruMoreRecently = 0;
vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
VfdCache[0].lruLessRecently = file;
VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
DO_DB(_dump_lru());
}
/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
LruInsert(File file)
{
Vfd *vfdP;
Assert(file != 0);
DO_DB(elog(LOG, "LruInsert %d (%s)",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
if (FileIsNotOpen(file))
{
while (nfile + numAllocatedDescs >= max_safe_fds)
{
if (!ReleaseLruFile())
break;
}
/*
* The open could still fail for lack of file descriptors, e.g., due to
* overall system file table being full. So, be prepared to release
* another FD if necessary...
*/
vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
vfdP->fileMode);
if (vfdP->fd < 0)
{
DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
return vfdP->fd;
}
else
{
DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
++nfile;
}
/* seek to the right position */
if (vfdP->seekPos != (off_t) 0)
{
off_t returnValue;
returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
Assert(returnValue != (off_t) -1);
}
}
/*
* put it at the head of the Lru ring
*/
Insert(file);
return 0;
}
static bool
ReleaseLruFile(void)
{
DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
if (nfile > 0)
{
/*
* There are opened files and so there should be at least one used vfd
* in the ring.
*/
Assert(VfdCache[0].lruMoreRecently != 0);
LruDelete(VfdCache[0].lruMoreRecently);
return true; /* freed a file */
}
return false; /* no files available to free */
}
static File
AllocateVfd(void)
{
Index i;
File file;
DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache));
Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
if (VfdCache[0].nextFree == 0)
{
/*
* The free list is empty so it is time to increase the size of the
* array. We choose to double it each time this happens. However,
* there's not much point in starting *real* small.
*/
Size newCacheSize = SizeVfdCache * 2;
Vfd *newVfdCache;
if (newCacheSize < 32)
newCacheSize = 32;
/*
* Be careful not to clobber VfdCache ptr if realloc fails.
*/
newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
if (newVfdCache == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
VfdCache = newVfdCache;
/*
* Initialize the new entries and link them into the free list.
*/
for (i = SizeVfdCache; i < newCacheSize; i++)
{
MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
VfdCache[i].nextFree = i + 1;
VfdCache[i].fd = VFD_CLOSED;
}
VfdCache[newCacheSize - 1].nextFree = 0;
VfdCache[0].nextFree = SizeVfdCache;
/*
* Record the new size
*/
SizeVfdCache = newCacheSize;
}
file = VfdCache[0].nextFree;
VfdCache[0].nextFree = VfdCache[file].nextFree;
return file;
}
static void
FreeVfd(File file)
{
Vfd *vfdP = &VfdCache[file];
DO_DB(elog(LOG, "FreeVfd: %d (%s)",
file, vfdP->fileName ? vfdP->fileName : ""));
if (vfdP->fileName != NULL)
{
free(vfdP->fileName);
vfdP->fileName = NULL;
}
vfdP->fdstate = 0x0;
vfdP->nextFree = VfdCache[0].nextFree;
VfdCache[0].nextFree = file;
}
/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
FileAccess(File file)
{
int returnValue;
DO_DB(elog(LOG, "FileAccess %d (%s)",
file, VfdCache[file].fileName));
/*
* Is the file open? If not, open it and put it at the head of the LRU
* ring (possibly closing the least recently used file to get an FD).
*/
if (FileIsNotOpen(file))
{
returnValue = LruInsert(file);
if (returnValue != 0)
return returnValue;
}
else if (VfdCache[0].lruLessRecently != file)
{
/*
* We now know that the file is open and that it is not the last one
* accessed, so we need to move it to the head of the Lru ring.
*/
Delete(file);
Insert(file);
}
return 0;
}
/*
* Called when we get a shared invalidation message on some relation.
*/
#ifdef NOT_USED
void
FileInvalidate(File file)
{
Assert(FileIsValid(file));
if (!FileIsNotOpen(file))
LruDelete(file);
}
#endif
/*
* open a file in an arbitrary directory
*
* NB: if the passed pathname is relative (which it usually is),
* it will be interpreted relative to the process' working directory
* (which should always be $PGDATA when this code is running).
*/
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
char *fnamecopy;
File file;
Vfd *vfdP;
DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
fileName, fileFlags, fileMode));
/*
* We need a malloc'd copy of the file name; fail cleanly if no room.
*/
fnamecopy = strdup(fileName);
if (fnamecopy == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
file = AllocateVfd();
vfdP = &VfdCache[file];
while (nfile + numAllocatedDescs >= max_safe_fds)
{
if (!ReleaseLruFile())
break;
}
vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
if (vfdP->fd < 0)
{
FreeVfd(file);
free(fnamecopy);
return -1;
}
++nfile;
DO_DB(elog(LOG, "PathNameOpenFile: success %d",
vfdP->fd));
Insert(file);
vfdP->fileName = fnamecopy;
/* Saved flags are adjusted to be OK for re-opening file */
vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
vfdP->fileMode = fileMode;
vfdP->seekPos = 0;
vfdP->fdstate = 0x0;
vfdP->resowner = NULL;
return file;
}
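/*
 * Illustrative sketch only: open a file through the VFD layer, read its
 * first block, and close it.  The path name is hypothetical; once a file
 * is opened this way, all further I/O must go through the File routines.
 */
#ifdef NOT_USED
static void
example_vfd_read(void)
{
	File		file;
	char		buf[BLCKSZ];
	int			nread;

	file = PathNameOpenFile("base/12345/67890", O_RDONLY | PG_BINARY, 0600);
	if (file < 0)
		elog(ERROR, "could not open file: %m");
	nread = FileRead(file, buf, BLCKSZ);
	if (nread < 0)
		elog(ERROR, "could not read file: %m");
	FileClose(file);
}
#endif   /* NOT_USED */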
/*
* Open a temporary file that will disappear when we close it.
*
* This routine takes care of generating an appropriate tempfile name.
* There's no need to pass in fileFlags or fileMode either, since only
* one setting makes any sense for a temp file.
*
* Unless interXact is true, the file is remembered by CurrentResourceOwner
* to ensure it's closed and deleted when it's no longer needed, typically at
* the end-of-transaction. In most cases, you don't want temporary files to
* outlive the transaction that created them, so this should be false -- but
* if you need "somewhat" temporary storage, this might be useful. In either
* case, the file is removed when the File is explicitly closed.
*/
File
OpenTemporaryFile(bool interXact)
{
File file = 0;
/*
* If some temp tablespace(s) have been given to us, try to use the next
* one. If a given tablespace can't be found, we silently fall back to
* the database's default tablespace.
*
* BUT: if the temp file is slated to outlive the current transaction,
* force it into the database's default tablespace, so that it will not
* pose a threat to possible tablespace drop attempts.
*/
if (numTempTableSpaces > 0 && !interXact)
{
Oid tblspcOid = GetNextTempTableSpace();
if (OidIsValid(tblspcOid))
file = OpenTemporaryFileInTablespace(tblspcOid, false);
}
/*
* If not, or if tablespace is bad, create in database's default
* tablespace. MyDatabaseTableSpace should normally be set before we get
* here, but just in case it isn't, fall back to pg_default tablespace.
*/
if (file <= 0)
file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
MyDatabaseTableSpace :
DEFAULTTABLESPACE_OID,
true);
/* Mark it for deletion at close */
VfdCache[file].fdstate |= FD_TEMPORARY;
/* Register it with the current resource owner */
if (!interXact)
{
VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
ResourceOwnerEnlargeFiles(CurrentResourceOwner);
ResourceOwnerRememberFile(CurrentResourceOwner, file);
VfdCache[file].resowner = CurrentResourceOwner;
/* ensure cleanup happens at eoxact */
have_xact_temporary_files = true;
}
return file;
}
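/*
 * Illustrative sketch only: round-trip some hypothetical data through a
 * transaction-scoped temporary file.  OpenTemporaryFile(false) registers
 * the file with CurrentResourceOwner, and FileClose() deletes it.
 */
#ifdef NOT_USED
static void
example_temp_file_roundtrip(void)
{
	File		file;
	char		data[] = "scratch data";
	char		buf[sizeof(data)];
	int			len = (int) sizeof(data);

	file = OpenTemporaryFile(false);
	if (FileWrite(file, data, len) != len)
		elog(ERROR, "could not write temporary file: %m");
	if (FileSeek(file, (off_t) 0, SEEK_SET) != 0)
		elog(ERROR, "could not seek in temporary file: %m");
	if (FileRead(file, buf, len) != len)
		elog(ERROR, "could not read temporary file: %m");
	FileClose(file);			/* also unlinks the underlying temp file */
}
#endif   /* NOT_USED */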
/*
* Open a temporary file in a specific tablespace.
* Subroutine for OpenTemporaryFile, which see for details.
*/
static File
OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
{
char tempdirpath[MAXPGPATH];
char tempfilepath[MAXPGPATH];
File file;
/*
* Identify the tempfile directory for this tablespace.
*
* If someone tries to specify pg_global, use pg_default instead.
*/
if (tblspcOid == DEFAULTTABLESPACE_OID ||
tblspcOid == GLOBALTABLESPACE_OID)
{
/* The default tablespace is {datadir}/base */
snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
PG_TEMP_FILES_DIR);
}
else
{
/* All other tablespaces are accessed via symlinks */
snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s",
tblspcOid, PG_TEMP_FILES_DIR);
}
/*
* Generate a tempfile name that should be unique within the current
* database instance.
*/
snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
/*
* Open the file. Note: we don't use O_EXCL, in case there is an orphaned
* temp file that can be reused.
*/
file = PathNameOpenFile(tempfilepath,
O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
0600);
if (file <= 0)
{
/*
* We might need to create the tablespace's tempfile directory, if no
* one has yet done so.
*
* Don't check for error from mkdir; it could fail if someone else
* just did the same thing. If it doesn't work then we'll bomb out on
* the second create attempt, instead.
*/
mkdir(tempdirpath, S_IRWXU);
file = PathNameOpenFile(tempfilepath,
O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
0600);
if (file <= 0 && rejectError)
elog(ERROR, "could not create temporary file \"%s\": %m",
tempfilepath);
}
return file;
}
/*
* close a file when done with it
*/
void
FileClose(File file)
{
Vfd *vfdP;
struct stat filestats;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileClose: %d (%s)",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
if (!FileIsNotOpen(file))
{
/* remove the file from the lru ring */
Delete(file);
/* close the file */
if (close(vfdP->fd))
elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);
--nfile;
vfdP->fd = VFD_CLOSED;
}
/*
* Delete the file if it was temporary
*/
if (vfdP->fdstate & FD_TEMPORARY)
{
/* reset flag so that die() interrupt won't cause problems */
vfdP->fdstate &= ~FD_TEMPORARY;
if (log_temp_files >= 0)
{
if (stat(vfdP->fileName, &filestats) == 0)
{
if (filestats.st_size >= log_temp_files)
ereport(LOG,
(errmsg("temporary file: path \"%s\", size %lu",
vfdP->fileName,
(unsigned long) filestats.st_size)));
}
else
elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
}
if (unlink(vfdP->fileName))
elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
}
/* Unregister it from the resource owner */
if (vfdP->resowner)
ResourceOwnerForgetFile(vfdP->resowner, file);
/*
* Return the Vfd slot to the free list
*/
FreeVfd(file);
}
/*
* FilePrefetch - initiate asynchronous read of a given range of the file.
* The logical seek position is unaffected.
*
* Currently the only implementation of this function is using posix_fadvise
* which is the simplest standardized interface that accomplishes this.
* We could add an implementation using libaio in the future; but note that
* this API is inappropriate for libaio, which wants to have a buffer provided
* to read into.
*/
int
FilePrefetch(File file, off_t offset, int amount)
{
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
file, VfdCache[file].fileName,
(int64) offset, amount));
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
POSIX_FADV_WILLNEED);
return returnCode;
#else
Assert(FileIsValid(file));
return 0;
#endif
}
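/*
 * Illustrative sketch only: advise the kernel that a (hypothetical) range
 * will be read soon, overlap other work with the readahead, then read.
 * On platforms without posix_fadvise the hint is silently a no-op.
 */
#ifdef NOT_USED
static void
example_prefetch_then_read(File file)
{
	char		buf[BLCKSZ];

	(void) FilePrefetch(file, (off_t) 0, BLCKSZ);
	/* ... do other work while the readahead proceeds ... */
	if (FileSeek(file, (off_t) 0, SEEK_SET) < 0)
		elog(ERROR, "could not seek: %m");
	if (FileRead(file, buf, BLCKSZ) < 0)
		elog(ERROR, "could not read: %m");
}
#endif   /* NOT_USED */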
int
FileRead(File file, char *buffer, int amount)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
file, VfdCache[file].fileName,
(int64) VfdCache[file].seekPos,
amount, buffer));
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
retry:
returnCode = read(VfdCache[file].fd, buffer, amount);
if (returnCode >= 0)
VfdCache[file].seekPos += returnCode;
else
{
/*
* Windows may run out of kernel buffers and return "Insufficient
* system resources" error. Wait a bit and retry to solve it.
*
* It is rumored that EINTR is also possible on some Unix filesystems,
* in which case immediate retry is indicated.
*/
#ifdef WIN32
DWORD error = GetLastError();
switch (error)
{
case ERROR_NO_SYSTEM_RESOURCES:
pg_usleep(1000L);
errno = EINTR;
break;
default:
_dosmaperr(error);
break;
}
#endif
/* OK to retry if interrupted */
if (errno == EINTR)
goto retry;
/* Trouble, so assume we don't know the file position anymore */
VfdCache[file].seekPos = FileUnknownPos;
}
return returnCode;
}
int
FileWrite(File file, char *buffer, int amount)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
file, VfdCache[file].fileName,
(int64) VfdCache[file].seekPos,
amount, buffer));
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
retry:
errno = 0;
returnCode = write(VfdCache[file].fd, buffer, amount);
/* if write didn't set errno, assume problem is no disk space */
if (returnCode != amount && errno == 0)
errno = ENOSPC;
if (returnCode >= 0)
VfdCache[file].seekPos += returnCode;
else
{
/*
* See comments in FileRead()
*/
#ifdef WIN32
DWORD error = GetLastError();
switch (error)
{
case ERROR_NO_SYSTEM_RESOURCES:
pg_usleep(1000L);
errno = EINTR;
break;
default:
_dosmaperr(error);
break;
}
#endif
/* OK to retry if interrupted */
if (errno == EINTR)
goto retry;
/* Trouble, so assume we don't know the file position anymore */
VfdCache[file].seekPos = FileUnknownPos;
}
return returnCode;
}
int
FileSync(File file)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileSync: %d (%s)",
file, VfdCache[file].fileName));
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
return pg_fsync(VfdCache[file].fd);
}
off_t
FileSeek(File file, off_t offset, int whence)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
file, VfdCache[file].fileName,
(int64) VfdCache[file].seekPos,
(int64) offset, whence));
if (FileIsNotOpen(file))
{
switch (whence)
{
case SEEK_SET:
if (offset < 0)
elog(ERROR, "invalid seek offset: " INT64_FORMAT,
(int64) offset);
VfdCache[file].seekPos = offset;
break;
case SEEK_CUR:
VfdCache[file].seekPos += offset;
break;
case SEEK_END:
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
break;
}
}
else
{
switch (whence)
{
case SEEK_SET:
if (offset < 0)
elog(ERROR, "invalid seek offset: " INT64_FORMAT,
(int64) offset);
if (VfdCache[file].seekPos != offset)
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
case SEEK_CUR:
if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
case SEEK_END:
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
break;
}
}
return VfdCache[file].seekPos;
}
/*
* XXX not actually used but here for completeness
*/
#ifdef NOT_USED
off_t
FileTell(File file)
{
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileTell %d (%s)",
file, VfdCache[file].fileName));
return VfdCache[file].seekPos;
}
#endif
int
FileTruncate(File file, off_t offset)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileTruncate %d (%s)",
file, VfdCache[file].fileName));
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
returnCode = ftruncate(VfdCache[file].fd, offset);
return returnCode;
}
/*
* Return the pathname associated with an open file.
*
* The returned string points to an internal buffer, which is valid until
* the file is closed.
*/
char *
FilePathName(File file)
{
Assert(FileIsValid(file));
return VfdCache[file].fileName;
}
/*
* Routines that want to use stdio (ie, FILE*) should use AllocateFile
* rather than plain fopen(). This lets fd.c deal with freeing FDs if
* necessary to open the file. When done, call FreeFile rather than fclose.
*
* Note that files that will be open for any significant length of time
* should NOT be handled this way, since they cannot share kernel file
* descriptors with other files; there is grave risk of running out of FDs
* if anyone locks down too many FDs. Most callers of this routine are
* simply reading a config file that they will read and close immediately.
*
* fd.c will automatically close all files opened with AllocateFile at
* transaction commit or abort; this prevents FD leakage if a routine
* that calls AllocateFile is terminated prematurely by ereport(ERROR).
*
* Ideally this should be the *only* direct call of fopen() in the backend.
*/
FILE *
AllocateFile(const char *name, const char *mode)
{
FILE *file;
DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
numAllocatedDescs, name));
/*
* The test against MAX_ALLOCATED_DESCS prevents us from overflowing
* allocatedFiles[]; the test against max_safe_fds prevents AllocateFile
* from hogging every one of the available FDs, which'd lead to infinite
* looping.
*/
if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
numAllocatedDescs >= max_safe_fds - 1)
elog(ERROR, "too many private files demanded");
TryAgain:
if ((file = fopen(name, mode)) != NULL)
{
AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
desc->kind = AllocateDescFile;
desc->desc.file = file;
desc->create_subid = GetCurrentSubTransactionId();
numAllocatedDescs++;
return desc->desc.file;
}
if (errno == EMFILE || errno == ENFILE)
{
int save_errno = errno;
ereport(LOG,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("out of file descriptors: %m; release and retry")));
errno = 0;
if (ReleaseLruFile())
goto TryAgain;
errno = save_errno;
}
return NULL;
}
/*
* Free an AllocateDesc of either type.
*
* The argument *must* point into the allocatedDescs[] array.
*/
static int
FreeDesc(AllocateDesc *desc)
{
int result;
/* Close the underlying object */
switch (desc->kind)
{
case AllocateDescFile:
result = fclose(desc->desc.file);
break;
case AllocateDescDir:
result = closedir(desc->desc.dir);
break;
default:
elog(ERROR, "AllocateDesc kind not recognized");
result = 0; /* keep compiler quiet */
break;
}
/* Compact storage in the allocatedDescs array */
numAllocatedDescs--;
*desc = allocatedDescs[numAllocatedDescs];
return result;
}
/*
* Close a file returned by AllocateFile.
*
* Note we do not check fclose's return value --- it is up to the caller
* to handle close errors.
*/
int
FreeFile(FILE *file)
{
int i;
DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
/* Remove file from list of allocated files, if it's present */
for (i = numAllocatedDescs; --i >= 0;)
{
AllocateDesc *desc = &allocatedDescs[i];
if (desc->kind == AllocateDescFile && desc->desc.file == file)
return FreeDesc(desc);
}
/* Only get here if someone passes us a file not in allocatedDescs */
elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
return fclose(file);
}
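/*
 * Illustrative sketch only: read a small text file via stdio using
 * AllocateFile/FreeFile, so the FILE* cannot be leaked across an
 * ereport(ERROR).  The file name is hypothetical.
 */
#ifdef NOT_USED
static void
example_allocate_file(void)
{
	FILE	   *fp;
	char		line[MAXPGPATH];

	fp = AllocateFile("hypothetical.conf", "r");
	if (fp == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", "hypothetical.conf")));
	while (fgets(line, sizeof(line), fp) != NULL)
	{
		/* process one line of the file */
	}
	FreeFile(fp);
}
#endif   /* NOT_USED */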
/*
* Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
* rather than plain opendir(). This lets fd.c deal with freeing FDs if
* necessary to open the directory, and with closing it after an elog.
* When done, call FreeDir rather than closedir.
*
* Ideally this should be the *only* direct call of opendir() in the backend.
*/
DIR *
AllocateDir(const char *dirname)
{
DIR *dir;
DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
numAllocatedDescs, dirname));
/*
* The test against MAX_ALLOCATED_DESCS prevents us from overflowing
* allocatedDescs[]; the test against max_safe_fds prevents AllocateDir
* from hogging every one of the available FDs, which'd lead to infinite
* looping.
*/
if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
numAllocatedDescs >= max_safe_fds - 1)
elog(ERROR, "too many private dirs demanded");
TryAgain:
if ((dir = opendir(dirname)) != NULL)
{
AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
desc->kind = AllocateDescDir;
desc->desc.dir = dir;
desc->create_subid = GetCurrentSubTransactionId();
numAllocatedDescs++;
return desc->desc.dir;
}
if (errno == EMFILE || errno == ENFILE)
{
int save_errno = errno;
ereport(LOG,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("out of file descriptors: %m; release and retry")));
errno = 0;
if (ReleaseLruFile())
goto TryAgain;
errno = save_errno;
}
return NULL;
}
/*
* Read a directory opened with AllocateDir, ereport'ing any error.
*
* This is easier to use than raw readdir() since it takes care of some
* otherwise rather tedious and error-prone manipulation of errno. Also,
* if you are happy with a generic error message for AllocateDir failure,
* you can just do
*
* dir = AllocateDir(path);
* while ((dirent = ReadDir(dir, path)) != NULL)
* process dirent;
* FreeDir(dir);
*
* since a NULL dir parameter is taken as indicating AllocateDir failed.
* (Make sure errno hasn't been changed since AllocateDir if you use this
* shortcut.)
*
* The pathname passed to AllocateDir must be passed to this routine too,
* but it is only used for error reporting.
*/
struct dirent *
ReadDir(DIR *dir, const char *dirname)
{
struct dirent *dent;
/* Give a generic message for AllocateDir failure, if caller didn't */
if (dir == NULL)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m",
dirname)));
errno = 0;
if ((dent = readdir(dir)) != NULL)
return dent;
#ifdef WIN32
/*
* This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
* released version
*/
if (GetLastError() == ERROR_NO_MORE_FILES)
errno = 0;
#endif
if (errno)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read directory \"%s\": %m",
dirname)));
return NULL;
}
/*
* Close a directory opened with AllocateDir.
*
* Note we do not check closedir's return value --- it is up to the caller
* to handle close errors.
*/
int
FreeDir(DIR *dir)
{
int i;
DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
/* Remove dir from list of allocated dirs, if it's present */
for (i = numAllocatedDescs; --i >= 0;)
{
AllocateDesc *desc = &allocatedDescs[i];
if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
return FreeDesc(desc);
}
/* Only get here if someone passes us a dir not in allocatedDescs */
elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
return closedir(dir);
}
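/*
 * Illustrative sketch only: a concrete instance of the AllocateDir /
 * ReadDir / FreeDir pattern shown in the ReadDir comment above, scanning
 * a hypothetical directory.  ReadDir itself reports an AllocateDir
 * failure, so no separate NULL check is needed here.
 */
#ifdef NOT_USED
static void
example_scan_directory(void)
{
	const char *path = "pg_some_hypothetical_dir";
	DIR		   *dir;
	struct dirent *de;

	dir = AllocateDir(path);
	while ((de = ReadDir(dir, path)) != NULL)
	{
		if (strcmp(de->d_name, ".") == 0 ||
			strcmp(de->d_name, "..") == 0)
			continue;
		/* process de->d_name here */
	}
	FreeDir(dir);
}
#endif   /* NOT_USED */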
/*
* closeAllVfds
*
* Force all VFDs into the physically-closed state, so that the fewest
* possible number of kernel file descriptors are in use. There is no
* change in the logical state of the VFDs.
*/
void
closeAllVfds(void)
{
Index i;
if (SizeVfdCache > 0)
{
Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
for (i = 1; i < SizeVfdCache; i++)
{
if (!FileIsNotOpen(i))
LruDelete(i);
}
}
}
/*
* SetTempTablespaces
*
* Define a list (actually an array) of OIDs of tablespaces to use for
* temporary files. This list will be used until end of transaction,
* unless this function is called again before then. It is caller's
* responsibility that the passed-in array has adequate lifespan (typically
* it'd be allocated in TopTransactionContext).
*/
void
SetTempTablespaces(Oid *tableSpaces, int numSpaces)
{
Assert(numSpaces >= 0);
tempTableSpaces = tableSpaces;
numTempTableSpaces = numSpaces;
/*
* Select a random starting point in the list. This is to minimize
* conflicts between backends that are most likely sharing the same list
* of temp tablespaces. Note that if we create multiple temp files in the
* same transaction, we'll advance circularly through the list --- this
* ensures that large temporary sort files are nicely spread across all
* available tablespaces.
*/
if (numSpaces > 1)
nextTempTableSpace = random() % numSpaces;
else
nextTempTableSpace = 0;
}
/*
* TempTablespacesAreSet
*
* Returns TRUE if SetTempTablespaces has been called in current transaction.
* (This is just so that tablespaces.c doesn't need its own per-transaction
* state.)
*/
bool
TempTablespacesAreSet(void)
{
return (numTempTableSpaces >= 0);
}
/*
* GetNextTempTableSpace
*
* Select the next temp tablespace to use. A result of InvalidOid means
* to use the current database's default tablespace.
*/
Oid
GetNextTempTableSpace(void)
{
if (numTempTableSpaces > 0)
{
/* Advance nextTempTableSpace counter with wraparound */
if (++nextTempTableSpace >= numTempTableSpaces)
nextTempTableSpace = 0;
return tempTableSpaces[nextTempTableSpace];
}
return InvalidOid;
}
/*
* AtEOSubXact_Files
*
* Take care of subtransaction commit/abort. At abort, we close temp files
* that the subtransaction may have opened. At commit, we reassign the
* files that were opened to the parent subtransaction.
*/
void
AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
SubTransactionId parentSubid)
{
Index i;
for (i = 0; i < numAllocatedDescs; i++)
{
if (allocatedDescs[i].create_subid == mySubid)
{
if (isCommit)
allocatedDescs[i].create_subid = parentSubid;
else
{
/* have to recheck the item after FreeDesc (ugly) */
FreeDesc(&allocatedDescs[i--]);
}
}
}
}
/*
* AtEOXact_Files
*
* This routine is called during transaction commit or abort (it doesn't
* particularly care which). All still-open per-transaction temporary file
* VFDs are closed, which also causes the underlying files to be deleted
* (although they should've been closed already by the ResourceOwner
* cleanup). Furthermore, all "allocated" stdio files are closed. We also
* forget any transaction-local temp tablespace list.
*/
void
AtEOXact_Files(void)
{
CleanupTempFiles(false);
tempTableSpaces = NULL;
numTempTableSpaces = -1;
}
/*
* AtProcExit_Files
*
* on_proc_exit hook to clean up temp files during backend shutdown.
* Here, we want to clean up *all* temp files including interXact ones.
*/
static void
AtProcExit_Files(int code, Datum arg)
{
CleanupTempFiles(true);
}
/*
* Close temporary files and delete their underlying files.
*
* isProcExit: if true, this is being called as the backend process is
* exiting. If that's the case, we should remove all temporary files; if
* that's not the case, we are being called for transaction commit/abort
* and should only remove transaction-local temp files. In either case,
* also clean up "allocated" stdio files and dirs.
*/
static void
CleanupTempFiles(bool isProcExit)
{
Index i;
/*
* Careful here: at proc_exit we need extra cleanup, not just
* xact_temporary files.
*/
if (isProcExit || have_xact_temporary_files)
{
Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
for (i = 1; i < SizeVfdCache; i++)
{
unsigned short fdstate = VfdCache[i].fdstate;
if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
{
/*
* If we're in the process of exiting a backend process, close
* all temporary files. Otherwise, only close temporary files
* local to the current transaction. They should be closed
* by the ResourceOwner mechanism already, so this is just
* a debugging cross-check.
*/
if (isProcExit)
FileClose(i);
else if (fdstate & FD_XACT_TEMPORARY)
{
elog(WARNING,
"temporary file %s not closed at end-of-transaction",
VfdCache[i].fileName);
FileClose(i);
}
}
}
have_xact_temporary_files = false;
}
/* Clean up "allocated" stdio files and dirs. */
while (numAllocatedDescs > 0)
FreeDesc(&allocatedDescs[0]);
}
/*
* Remove temporary files left over from a prior postmaster session
*
* This should be called during postmaster startup. It will forcibly
* remove any leftover files created by OpenTemporaryFile.
*
* NOTE: we could, but don't, call this during a post-backend-crash restart
* cycle. The argument for not doing it is that someone might want to examine
* the temp files for debugging purposes. This does however mean that
* OpenTemporaryFile had better allow for collision with an existing temp
* file name.
*/
void
RemovePgTempFiles(void)
{
char temp_path[MAXPGPATH];
DIR *spc_dir;
struct dirent *spc_de;
/*
* First process temp files in pg_default ($PGDATA/base)
*/
snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
RemovePgTempFilesInDir(temp_path);
/*
* Cycle through temp directories for all non-default tablespaces.
*/
spc_dir = AllocateDir("pg_tblspc");
while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
{
if (strcmp(spc_de->d_name, ".") == 0 ||
strcmp(spc_de->d_name, "..") == 0)
continue;
snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
spc_de->d_name, PG_TEMP_FILES_DIR);
RemovePgTempFilesInDir(temp_path);
}
FreeDir(spc_dir);
/*
* In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
* DataDir as well.
*/
#ifdef EXEC_BACKEND
RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
#endif
}
/* Process one pgsql_tmp directory for RemovePgTempFiles */
static void
RemovePgTempFilesInDir(const char *tmpdirname)
{
DIR *temp_dir;
struct dirent *temp_de;
char rm_path[MAXPGPATH];
temp_dir = AllocateDir(tmpdirname);
if (temp_dir == NULL)
{
/* anything except ENOENT is fishy */
if (errno != ENOENT)
elog(LOG,
"could not open temporary-files directory \"%s\": %m",
tmpdirname);
return;
}
while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
{
if (strcmp(temp_de->d_name, ".") == 0 ||
strcmp(temp_de->d_name, "..") == 0)
continue;
snprintf(rm_path, sizeof(rm_path), "%s/%s",
tmpdirname, temp_de->d_name);
if (strncmp(temp_de->d_name,
PG_TEMP_FILE_PREFIX,
strlen(PG_TEMP_FILE_PREFIX)) == 0)
unlink(rm_path); /* note we ignore any error */
else
elog(LOG,
"unexpected file found in temporary-files directory: \"%s\"",
rm_path);
}
FreeDir(temp_dir);
}