1999-10-13 17:02:32 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* buffile.c
|
2017-11-25 19:19:43 +01:00
|
|
|
* Management of large buffered temporary files.
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
2019-01-02 18:44:25 +01:00
|
|
|
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
|
2000-01-26 06:58:53 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/backend/storage/file/buffile.c
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
|
|
|
* NOTES:
|
|
|
|
*
|
|
|
|
* BufFiles provide a very incomplete emulation of stdio atop virtual Files
|
|
|
|
* (as managed by fd.c). Currently, we only support the buffered-I/O
|
|
|
|
* aspect of stdio: a read or write of the low-level File occurs only
|
|
|
|
* when the buffer is filled or emptied. This is an even bigger win
|
|
|
|
* for virtual Files than for ordinary kernel files, since reducing the
|
|
|
|
* frequency with which a virtual File is touched reduces "thrashing"
|
|
|
|
* of opening/closing file descriptors.
|
|
|
|
*
|
|
|
|
* Note that BufFile structs are allocated with palloc(), and therefore
|
2017-11-25 19:19:43 +01:00
|
|
|
* will go away automatically at query/transaction end. Since the underlying
|
|
|
|
* virtual Files are made with OpenTemporaryFile, all resources for
|
1999-10-13 17:02:32 +02:00
|
|
|
* the file are certain to be cleaned up even if processing is aborted
|
2013-11-01 21:09:48 +01:00
|
|
|
* by ereport(ERROR). The data structures required are made in the
|
|
|
|
* palloc context that was current when the BufFile was created, and
|
|
|
|
* any external resources such as temp files are owned by the ResourceOwner
|
|
|
|
* that was current at that time.
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
|
|
|
* BufFile also supports temporary files that exceed the OS file size limit
|
2014-05-06 18:12:18 +02:00
|
|
|
* (by opening multiple fd.c temporary files). This is an essential feature
|
1999-10-16 21:49:28 +02:00
|
|
|
* for sorts and hashjoins on large amounts of data.
|
2017-12-02 01:30:56 +01:00
|
|
|
*
|
|
|
|
* BufFile supports temporary files that can be made read-only and shared with
|
|
|
|
* other backends, as infrastructure for parallel execution. Such files need
|
|
|
|
* to be created as a member of a SharedFileSet that all participants are
|
|
|
|
* attached to.
|
1999-10-13 17:02:32 +02:00
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "postgres.h"
|
|
|
|
|
2019-05-18 19:51:16 +02:00
|
|
|
#include "commands/tablespace.h"
|
2009-12-15 05:57:48 +01:00
|
|
|
#include "executor/instrument.h"
|
2017-12-02 01:30:56 +01:00
|
|
|
#include "miscadmin.h"
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
#include "pgstat.h"
|
2000-06-15 05:33:12 +02:00
|
|
|
#include "storage/fd.h"
|
1999-10-13 17:02:32 +02:00
|
|
|
#include "storage/buffile.h"
|
2008-09-17 15:15:55 +02:00
|
|
|
#include "storage/buf_internals.h"
|
2013-11-01 21:09:48 +01:00
|
|
|
#include "utils/resowner.h"
|
1999-10-13 17:02:32 +02:00
|
|
|
|
|
|
|
/*
 * BufFiles are divided into segments of one gigabyte each, and this is done
 * independently of RELSEG_SIZE.  The point of segmenting is to let a large
 * BufFile be spread across several tablespaces when they are available.
 *
 * BUFFILE_SEG_SIZE is the same segment size expressed in blocks of BLCKSZ
 * bytes.
 */
#define MAX_PHYSICAL_FILESIZE	0x40000000
#define BUFFILE_SEG_SIZE		(MAX_PHYSICAL_FILESIZE / BLCKSZ)
|
1999-10-13 17:02:32 +02:00
|
|
|
|
|
|
|
/*
|
1999-10-16 21:49:28 +02:00
|
|
|
* This data structure represents a buffered file that consists of one or
|
|
|
|
* more physical files (each accessed through a virtual file descriptor
|
|
|
|
* managed by fd.c).
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
1999-10-16 21:49:28 +02:00
|
|
|
struct BufFile
|
1999-10-13 17:02:32 +02:00
|
|
|
{
|
|
|
|
int numFiles; /* number of physical files in set */
|
|
|
|
/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
|
|
|
|
File *files; /* palloc'd array with numFiles entries */
|
|
|
|
|
2003-04-29 05:21:30 +02:00
|
|
|
bool isInterXact; /* keep open over transactions? */
|
1999-10-13 17:02:32 +02:00
|
|
|
bool dirty; /* does buffer need to be written? */
|
2017-12-02 01:30:56 +01:00
|
|
|
bool readOnly; /* has the file been set to read only? */
|
|
|
|
|
|
|
|
SharedFileSet *fileset; /* space for segment files if shared */
|
|
|
|
const char *name; /* name of this BufFile if shared */
|
2000-04-12 19:17:23 +02:00
|
|
|
|
2013-11-01 21:09:48 +01:00
|
|
|
/*
|
2014-05-06 18:12:18 +02:00
|
|
|
* resowner is the ResourceOwner to use for underlying temp files. (We
|
2013-11-01 21:09:48 +01:00
|
|
|
* don't need to remember the memory context we're using explicitly,
|
|
|
|
* because after creation we only repalloc our arrays larger.)
|
|
|
|
*/
|
|
|
|
ResourceOwner resowner;
|
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* "current pos" is position of start of buffer within the logical file.
|
|
|
|
* Position as seen by user of BufFile is (curFile, curOffset + pos).
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
|
|
|
int curFile; /* file index (0..n) part of current pos */
|
2008-03-10 21:06:27 +01:00
|
|
|
off_t curOffset; /* offset part of current pos */
|
1999-10-13 17:02:32 +02:00
|
|
|
int pos; /* next read/write position in buffer */
|
|
|
|
int nbytes; /* total # of valid bytes in buffer */
|
2018-09-01 21:27:12 +02:00
|
|
|
PGAlignedBlock buffer;
|
1999-10-13 17:02:32 +02:00
|
|
|
};
|
|
|
|
|
2018-06-16 07:21:08 +02:00
|
|
|
static BufFile *makeBufFileCommon(int nfiles);
|
1999-10-16 21:49:28 +02:00
|
|
|
static BufFile *makeBufFile(File firstfile);
|
|
|
|
static void extendBufFile(BufFile *file);
|
1999-10-13 17:02:32 +02:00
|
|
|
static void BufFileLoadBuffer(BufFile *file);
|
|
|
|
static void BufFileDumpBuffer(BufFile *file);
|
|
|
|
static int BufFileFlush(BufFile *file);
|
2017-12-02 01:30:56 +01:00
|
|
|
static File MakeNewSharedSegment(BufFile *file, int segment);
|
1999-10-13 17:02:32 +02:00
|
|
|
|
|
|
|
/*
|
2018-06-16 07:21:08 +02:00
|
|
|
* Create BufFile and perform the common initialization.
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
1999-10-16 21:49:28 +02:00
|
|
|
static BufFile *
|
2018-06-16 07:21:08 +02:00
|
|
|
makeBufFileCommon(int nfiles)
|
1999-10-13 17:02:32 +02:00
|
|
|
{
|
2000-04-12 19:17:23 +02:00
|
|
|
BufFile *file = (BufFile *) palloc(sizeof(BufFile));
|
1999-10-13 17:02:32 +02:00
|
|
|
|
2018-06-16 07:21:08 +02:00
|
|
|
file->numFiles = nfiles;
|
2007-06-03 19:08:34 +02:00
|
|
|
file->isInterXact = false;
|
1999-10-16 21:49:28 +02:00
|
|
|
file->dirty = false;
|
2013-11-01 21:09:48 +01:00
|
|
|
file->resowner = CurrentResourceOwner;
|
1999-10-16 21:49:28 +02:00
|
|
|
file->curFile = 0;
|
|
|
|
file->curOffset = 0L;
|
|
|
|
file->pos = 0;
|
|
|
|
file->nbytes = 0;
|
2018-06-16 07:21:08 +02:00
|
|
|
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a BufFile given the first underlying physical file.
|
|
|
|
* NOTE: caller must set isInterXact if appropriate.
|
|
|
|
*/
|
|
|
|
static BufFile *
|
|
|
|
makeBufFile(File firstfile)
|
|
|
|
{
|
|
|
|
BufFile *file = makeBufFileCommon(1);
|
|
|
|
|
|
|
|
file->files = (File *) palloc(sizeof(File));
|
|
|
|
file->files[0] = firstfile;
|
2017-12-02 01:30:56 +01:00
|
|
|
file->readOnly = false;
|
|
|
|
file->fileset = NULL;
|
|
|
|
file->name = NULL;
|
1999-10-13 17:02:32 +02:00
|
|
|
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add another component temp file.
|
|
|
|
*/
|
|
|
|
static void
|
1999-10-16 21:49:28 +02:00
|
|
|
extendBufFile(BufFile *file)
|
1999-10-13 17:02:32 +02:00
|
|
|
{
|
|
|
|
File pfile;
|
2013-11-01 21:09:48 +01:00
|
|
|
ResourceOwner oldowner;
|
|
|
|
|
|
|
|
/* Be sure to associate the file with the BufFile's resource owner */
|
|
|
|
oldowner = CurrentResourceOwner;
|
|
|
|
CurrentResourceOwner = file->resowner;
|
1999-10-13 17:02:32 +02:00
|
|
|
|
2017-12-02 01:30:56 +01:00
|
|
|
if (file->fileset == NULL)
|
|
|
|
pfile = OpenTemporaryFile(file->isInterXact);
|
|
|
|
else
|
|
|
|
pfile = MakeNewSharedSegment(file, file->numFiles);
|
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
Assert(pfile >= 0);
|
|
|
|
|
2013-11-01 21:09:48 +01:00
|
|
|
CurrentResourceOwner = oldowner;
|
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
file->files = (File *) repalloc(file->files,
|
2000-04-12 19:17:23 +02:00
|
|
|
(file->numFiles + 1) * sizeof(File));
|
1999-10-13 17:02:32 +02:00
|
|
|
file->files[file->numFiles] = pfile;
|
|
|
|
file->numFiles++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a BufFile for a new temporary file (which will expand to become
|
|
|
|
* multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
|
|
|
|
* written to it).
|
2003-04-29 05:21:30 +02:00
|
|
|
*
|
2007-06-03 19:08:34 +02:00
|
|
|
* If interXact is true, the temp file will not be automatically deleted
|
2007-06-07 21:19:57 +02:00
|
|
|
* at end of transaction.
|
2007-06-03 19:08:34 +02:00
|
|
|
*
|
2003-04-29 05:21:30 +02:00
|
|
|
* Note: if interXact is true, the caller had better be calling us in a
|
2013-11-01 21:09:48 +01:00
|
|
|
* memory context, and with a resource owner, that will survive across
|
|
|
|
* transaction boundaries.
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
2001-10-25 07:50:21 +02:00
|
|
|
BufFile *
|
2007-06-07 21:19:57 +02:00
|
|
|
BufFileCreateTemp(bool interXact)
|
1999-10-13 17:02:32 +02:00
|
|
|
{
|
1999-10-16 21:49:28 +02:00
|
|
|
BufFile *file;
|
1999-10-13 17:02:32 +02:00
|
|
|
File pfile;
|
|
|
|
|
2019-05-18 19:51:16 +02:00
|
|
|
/*
|
|
|
|
* Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
|
|
|
|
* Possibly the caller will have done this already, but it seems useful to
|
|
|
|
* double-check here. Failure to do this at all would result in the temp
|
|
|
|
* files always getting placed in the default tablespace, which is a
|
|
|
|
* pretty hard-to-detect bug. Callers may prefer to do it earlier if they
|
|
|
|
* want to be sure that any required catalog access is done in some other
|
|
|
|
* resource context.
|
|
|
|
*/
|
|
|
|
PrepareTempTablespaces();
|
|
|
|
|
2007-06-07 21:19:57 +02:00
|
|
|
pfile = OpenTemporaryFile(interXact);
|
1999-10-13 17:02:32 +02:00
|
|
|
Assert(pfile >= 0);
|
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
file = makeBufFile(pfile);
|
2003-04-29 05:21:30 +02:00
|
|
|
file->isInterXact = interXact;
|
1999-10-13 17:02:32 +02:00
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
return file;
|
1999-10-13 17:02:32 +02:00
|
|
|
}
|
|
|
|
|
2017-12-02 01:30:56 +01:00
|
|
|
/*
|
|
|
|
* Build the name for a given segment of a given BufFile.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
SharedSegmentName(char *name, const char *buffile_name, int segment)
|
|
|
|
{
|
|
|
|
snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a new segment file backing a shared BufFile.
|
|
|
|
*/
|
|
|
|
static File
|
|
|
|
MakeNewSharedSegment(BufFile *buffile, int segment)
|
|
|
|
{
|
|
|
|
char name[MAXPGPATH];
|
|
|
|
File file;
|
|
|
|
|
2017-12-13 21:51:32 +01:00
|
|
|
/*
|
|
|
|
* It is possible that there are files left over from before a crash
|
2018-04-26 20:47:16 +02:00
|
|
|
* restart with the same name. In order for BufFileOpenShared() not to
|
|
|
|
* get confused about how many segments there are, we'll unlink the next
|
|
|
|
* segment number if it already exists.
|
2017-12-13 21:51:32 +01:00
|
|
|
*/
|
|
|
|
SharedSegmentName(name, buffile->name, segment + 1);
|
|
|
|
SharedFileSetDelete(buffile->fileset, name, true);
|
|
|
|
|
|
|
|
/* Create the new segment. */
|
2017-12-02 01:30:56 +01:00
|
|
|
SharedSegmentName(name, buffile->name, segment);
|
|
|
|
file = SharedFileSetCreate(buffile->fileset, name);
|
|
|
|
|
|
|
|
/* SharedFileSetCreate would've errored out */
|
|
|
|
Assert(file > 0);
|
|
|
|
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a BufFile that can be discovered and opened read-only by other
|
|
|
|
* backends that are attached to the same SharedFileSet using the same name.
|
|
|
|
*
|
|
|
|
* The naming scheme for shared BufFiles is left up to the calling code. The
|
|
|
|
* name will appear as part of one or more filenames on disk, and might
|
|
|
|
* provide clues to administrators about which subsystem is generating
|
|
|
|
* temporary file data. Since each SharedFileSet object is backed by one or
|
|
|
|
* more uniquely named temporary directory, names don't conflict with
|
|
|
|
* unrelated SharedFileSet objects.
|
|
|
|
*/
|
|
|
|
BufFile *
|
|
|
|
BufFileCreateShared(SharedFileSet *fileset, const char *name)
|
|
|
|
{
|
|
|
|
BufFile *file;
|
|
|
|
|
2018-06-16 07:21:08 +02:00
|
|
|
file = makeBufFileCommon(1);
|
2017-12-02 01:30:56 +01:00
|
|
|
file->fileset = fileset;
|
|
|
|
file->name = pstrdup(name);
|
|
|
|
file->files = (File *) palloc(sizeof(File));
|
|
|
|
file->files[0] = MakeNewSharedSegment(file, 0);
|
|
|
|
file->readOnly = false;
|
|
|
|
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Open a file that was previously created in another backend (or this one)
|
|
|
|
* with BufFileCreateShared in the same SharedFileSet using the same name.
|
|
|
|
* The backend that created the file must have called BufFileClose() or
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
* BufFileExportShared() to make sure that it is ready to be opened by other
|
2017-12-02 01:30:56 +01:00
|
|
|
* backends and render it read-only.
|
|
|
|
*/
|
|
|
|
BufFile *
|
|
|
|
BufFileOpenShared(SharedFileSet *fileset, const char *name)
|
|
|
|
{
|
2018-06-15 09:32:59 +02:00
|
|
|
BufFile *file;
|
2017-12-02 01:30:56 +01:00
|
|
|
char segment_name[MAXPGPATH];
|
|
|
|
Size capacity = 16;
|
2018-06-15 09:32:59 +02:00
|
|
|
File *files;
|
2017-12-02 01:30:56 +01:00
|
|
|
int nfiles = 0;
|
|
|
|
|
|
|
|
files = palloc(sizeof(File) * capacity);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't know how many segments there are, so we'll probe the
|
|
|
|
* filesystem to find out.
|
|
|
|
*/
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
/* See if we need to expand our file segment array. */
|
|
|
|
if (nfiles + 1 > capacity)
|
|
|
|
{
|
|
|
|
capacity *= 2;
|
|
|
|
files = repalloc(files, sizeof(File) * capacity);
|
|
|
|
}
|
|
|
|
/* Try to load a segment. */
|
|
|
|
SharedSegmentName(segment_name, name, nfiles);
|
|
|
|
files[nfiles] = SharedFileSetOpen(fileset, segment_name);
|
|
|
|
if (files[nfiles] <= 0)
|
|
|
|
break;
|
|
|
|
++nfiles;
|
|
|
|
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we didn't find any files at all, then no BufFile exists with this
|
|
|
|
* name.
|
|
|
|
*/
|
|
|
|
if (nfiles == 0)
|
2017-12-13 21:51:32 +01:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
Have BufFileSize() ereport() on FileSize() failure.
Move the responsibility for checking for and reporting a failure from
the only current BufFileSize() caller, logtape.c, to BufFileSize()
itself. Code within buffile.c is generally responsible for interfacing
with fd.c to report irrecoverable failures. This seems like a
convention that's worth sticking to.
Reorganizing things this way makes it easy to make the error message
raised in the event of BufFileSize() failure descriptive of the
underlying problem. We're now clear on the distinction between
temporary file name and BufFile name, and can show errno, confident that
its value actually relates to the error being reported. In passing, an
existing, similar buffile.c ereport() + errcode_for_file_access() site
is changed to follow the same conventions.
The API of the function BufFileSize() is changed by this commit, despite
already being in a stable release (Postgres 11). This seems acceptable,
since the BufFileSize() ABI was changed by commit aa551830421, which
hasn't made it into a point release yet. Besides, it's difficult to
imagine a third party BufFileSize() caller not just raising an error
anyway, since BufFile state should be considered corrupt when
BufFileSize() fails.
Per complaint from Tom Lane.
Discussion: https://postgr.es/m/26974.1540826748@sss.pgh.pa.us
Backpatch: 11-, where shared BufFiles were introduced.
2018-11-28 23:42:54 +01:00
|
|
|
errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
|
|
|
|
segment_name, name)));
|
2017-12-02 01:30:56 +01:00
|
|
|
|
2018-06-16 07:21:08 +02:00
|
|
|
file = makeBufFileCommon(nfiles);
|
2017-12-02 01:30:56 +01:00
|
|
|
file->files = files;
|
|
|
|
file->readOnly = true; /* Can't write to files opened this way */
|
|
|
|
file->fileset = fileset;
|
|
|
|
file->name = pstrdup(name);
|
|
|
|
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Delete a BufFile that was created by BufFileCreateShared in the given
|
|
|
|
* SharedFileSet using the given name.
|
|
|
|
*
|
|
|
|
* It is not necessary to delete files explicitly with this function. It is
|
|
|
|
* provided only as a way to delete files proactively, rather than waiting for
|
|
|
|
* the SharedFileSet to be cleaned up.
|
|
|
|
*
|
|
|
|
* Only one backend should attempt to delete a given name, and should know
|
|
|
|
* that it exists and has been exported or closed.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
BufFileDeleteShared(SharedFileSet *fileset, const char *name)
|
|
|
|
{
|
|
|
|
char segment_name[MAXPGPATH];
|
|
|
|
int segment = 0;
|
|
|
|
bool found = false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't know how many segments the file has. We'll keep deleting
|
|
|
|
* until we run out. If we don't manage to find even an initial segment,
|
|
|
|
* raise an error.
|
|
|
|
*/
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
SharedSegmentName(segment_name, name, segment);
|
|
|
|
if (!SharedFileSetDelete(fileset, segment_name, true))
|
|
|
|
break;
|
|
|
|
found = true;
|
|
|
|
++segment;
|
|
|
|
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!found)
|
|
|
|
elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BufFileExportShared --- flush and make read-only, in preparation for sharing.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
BufFileExportShared(BufFile *file)
|
|
|
|
{
|
|
|
|
/* Must be a file belonging to a SharedFileSet. */
|
|
|
|
Assert(file->fileset != NULL);
|
|
|
|
|
|
|
|
/* It's probably a bug if someone calls this twice. */
|
|
|
|
Assert(!file->readOnly);
|
|
|
|
|
|
|
|
BufFileFlush(file);
|
|
|
|
file->readOnly = true;
|
|
|
|
}
|
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
/*
|
|
|
|
* Close a BufFile
|
|
|
|
*
|
|
|
|
* Like fclose(), this also implicitly FileCloses the underlying File.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
BufFileClose(BufFile *file)
|
|
|
|
{
|
2000-04-12 19:17:23 +02:00
|
|
|
int i;
|
1999-10-16 21:49:28 +02:00
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
/* flush any unwritten data */
|
|
|
|
BufFileFlush(file);
|
2017-11-25 19:19:43 +01:00
|
|
|
/* close and delete the underlying file(s) */
|
1999-10-16 21:49:28 +02:00
|
|
|
for (i = 0; i < file->numFiles; i++)
|
|
|
|
FileClose(file->files[i]);
|
1999-10-13 17:02:32 +02:00
|
|
|
/* release the buffer space */
|
1999-10-16 21:49:28 +02:00
|
|
|
pfree(file->files);
|
1999-10-13 17:02:32 +02:00
|
|
|
pfree(file);
|
|
|
|
}
|
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
/*
|
|
|
|
* BufFileLoadBuffer
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
|
|
|
* Load some data into buffer, if possible, starting from curOffset.
|
|
|
|
* At call, must have dirty = false, pos and nbytes = 0.
|
|
|
|
* On exit, nbytes is number of bytes loaded.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
BufFileLoadBuffer(BufFile *file)
|
|
|
|
{
|
2000-04-12 19:17:23 +02:00
|
|
|
File thisfile;
|
1999-10-13 17:02:32 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Advance to next component file if necessary and possible.
|
|
|
|
*/
|
|
|
|
if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
|
2000-04-12 19:17:23 +02:00
|
|
|
file->curFile + 1 < file->numFiles)
|
1999-10-13 17:02:32 +02:00
|
|
|
{
|
|
|
|
file->curFile++;
|
|
|
|
file->curOffset = 0L;
|
|
|
|
}
|
2000-04-12 19:17:23 +02:00
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
/*
|
|
|
|
* Read whatever we can get, up to a full bufferload.
|
|
|
|
*/
|
2018-11-06 21:51:50 +01:00
|
|
|
thisfile = file->files[file->curFile];
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
file->nbytes = FileRead(thisfile,
|
2018-09-01 21:27:12 +02:00
|
|
|
file->buffer.data,
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
sizeof(file->buffer),
|
2018-11-06 21:51:50 +01:00
|
|
|
file->curOffset,
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
WAIT_EVENT_BUFFILE_READ);
|
1999-10-13 17:02:32 +02:00
|
|
|
if (file->nbytes < 0)
|
|
|
|
file->nbytes = 0;
|
|
|
|
/* we choose not to advance curOffset here */
|
2008-09-17 15:15:55 +02:00
|
|
|
|
2017-10-31 10:24:41 +01:00
|
|
|
if (file->nbytes > 0)
|
|
|
|
pgBufferUsage.temp_blks_read++;
|
1999-10-13 17:02:32 +02:00
|
|
|
}
|
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
/*
|
|
|
|
* BufFileDumpBuffer
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
|
|
|
* Dump buffer contents starting at curOffset.
|
|
|
|
* At call, should have dirty = true, nbytes > 0.
|
|
|
|
* On exit, dirty is cleared if successful write, and curOffset is advanced.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
BufFileDumpBuffer(BufFile *file)
|
|
|
|
{
|
|
|
|
int wpos = 0;
|
|
|
|
int bytestowrite;
|
|
|
|
File thisfile;
|
|
|
|
|
|
|
|
/*
|
2000-04-12 19:17:23 +02:00
|
|
|
* Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
|
|
|
|
* crosses a component-file boundary; so we need a loop.
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
|
|
|
while (wpos < file->nbytes)
|
|
|
|
{
|
2017-11-17 02:52:57 +01:00
|
|
|
off_t availbytes;
|
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
/*
|
|
|
|
* Advance to next component file if necessary and possible.
|
|
|
|
*/
|
2017-11-17 02:52:57 +01:00
|
|
|
if (file->curOffset >= MAX_PHYSICAL_FILESIZE)
|
1999-10-13 17:02:32 +02:00
|
|
|
{
|
2000-04-12 19:17:23 +02:00
|
|
|
while (file->curFile + 1 >= file->numFiles)
|
1999-10-16 21:49:28 +02:00
|
|
|
extendBufFile(file);
|
1999-10-13 17:02:32 +02:00
|
|
|
file->curFile++;
|
|
|
|
file->curOffset = 0L;
|
|
|
|
}
|
2000-04-12 19:17:23 +02:00
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
/*
|
2017-11-25 19:19:43 +01:00
|
|
|
* Determine how much we need to write into this file.
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
|
|
|
bytestowrite = file->nbytes - wpos;
|
2017-11-17 02:52:57 +01:00
|
|
|
availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
|
1999-10-13 17:02:32 +02:00
|
|
|
|
2017-11-17 02:52:57 +01:00
|
|
|
if ((off_t) bytestowrite > availbytes)
|
|
|
|
bytestowrite = (int) availbytes;
|
2000-04-12 19:17:23 +02:00
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
thisfile = file->files[file->curFile];
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
bytestowrite = FileWrite(thisfile,
|
2018-09-01 21:27:12 +02:00
|
|
|
file->buffer.data + wpos,
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
bytestowrite,
|
2018-11-06 21:51:50 +01:00
|
|
|
file->curOffset,
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
WAIT_EVENT_BUFFILE_WRITE);
|
1999-10-13 17:02:32 +02:00
|
|
|
if (bytestowrite <= 0)
|
|
|
|
return; /* failed to write */
|
|
|
|
file->curOffset += bytestowrite;
|
|
|
|
wpos += bytestowrite;
|
2008-09-17 15:15:55 +02:00
|
|
|
|
2009-12-15 05:57:48 +01:00
|
|
|
pgBufferUsage.temp_blks_written++;
|
1999-10-13 17:02:32 +02:00
|
|
|
}
|
|
|
|
file->dirty = false;
|
2000-04-12 19:17:23 +02:00
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* At this point, curOffset has been advanced to the end of the buffer,
|
|
|
|
* ie, its original value + nbytes. We need to make it point to the
|
|
|
|
* logical file position, ie, original value + pos, in case that is less
|
|
|
|
* (as could happen due to a small backwards seek in a dirty buffer!)
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
|
|
|
file->curOffset -= (file->nbytes - file->pos);
|
|
|
|
if (file->curOffset < 0) /* handle possible segment crossing */
|
|
|
|
{
|
|
|
|
file->curFile--;
|
|
|
|
Assert(file->curFile >= 0);
|
|
|
|
file->curOffset += MAX_PHYSICAL_FILESIZE;
|
|
|
|
}
|
2000-04-12 19:17:23 +02:00
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Now we can set the buffer empty without changing the logical position
|
2000-04-12 19:17:23 +02:00
|
|
|
*/
|
1999-10-13 17:02:32 +02:00
|
|
|
file->pos = 0;
|
|
|
|
file->nbytes = 0;
|
|
|
|
}
|
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
/*
|
|
|
|
* BufFileRead
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
|
|
|
* Like fread() except we assume 1-byte element size.
|
|
|
|
*/
|
|
|
|
size_t
|
|
|
|
BufFileRead(BufFile *file, void *ptr, size_t size)
|
|
|
|
{
|
|
|
|
size_t nread = 0;
|
|
|
|
size_t nthistime;
|
|
|
|
|
|
|
|
if (file->dirty)
|
|
|
|
{
|
|
|
|
if (BufFileFlush(file) != 0)
|
|
|
|
return 0; /* could not flush... */
|
2000-04-12 19:17:23 +02:00
|
|
|
Assert(!file->dirty);
|
1999-10-13 17:02:32 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
while (size > 0)
|
|
|
|
{
|
|
|
|
if (file->pos >= file->nbytes)
|
|
|
|
{
|
|
|
|
/* Try to load more data into buffer. */
|
|
|
|
file->curOffset += file->pos;
|
|
|
|
file->pos = 0;
|
|
|
|
file->nbytes = 0;
|
|
|
|
BufFileLoadBuffer(file);
|
|
|
|
if (file->nbytes <= 0)
|
|
|
|
break; /* no more data available */
|
|
|
|
}
|
|
|
|
|
|
|
|
nthistime = file->nbytes - file->pos;
|
|
|
|
if (nthistime > size)
|
|
|
|
nthistime = size;
|
|
|
|
Assert(nthistime > 0);
|
|
|
|
|
2018-09-01 21:27:12 +02:00
|
|
|
memcpy(ptr, file->buffer.data + file->pos, nthistime);
|
1999-10-13 17:02:32 +02:00
|
|
|
|
|
|
|
file->pos += nthistime;
|
|
|
|
ptr = (void *) ((char *) ptr + nthistime);
|
|
|
|
size -= nthistime;
|
|
|
|
nread += nthistime;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nread;
|
|
|
|
}
|
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
/*
|
|
|
|
* BufFileWrite
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
|
|
|
* Like fwrite() except we assume 1-byte element size.
|
|
|
|
*/
|
|
|
|
size_t
|
|
|
|
BufFileWrite(BufFile *file, void *ptr, size_t size)
|
|
|
|
{
|
|
|
|
size_t nwritten = 0;
|
|
|
|
size_t nthistime;
|
|
|
|
|
2017-12-02 01:30:56 +01:00
|
|
|
Assert(!file->readOnly);
|
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
while (size > 0)
|
|
|
|
{
|
|
|
|
if (file->pos >= BLCKSZ)
|
|
|
|
{
|
|
|
|
/* Buffer full, dump it out */
|
|
|
|
if (file->dirty)
|
|
|
|
{
|
|
|
|
BufFileDumpBuffer(file);
|
|
|
|
if (file->dirty)
|
|
|
|
break; /* I/O error */
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* Hmm, went directly from reading to writing? */
|
|
|
|
file->curOffset += file->pos;
|
|
|
|
file->pos = 0;
|
|
|
|
file->nbytes = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
nthistime = BLCKSZ - file->pos;
|
|
|
|
if (nthistime > size)
|
|
|
|
nthistime = size;
|
|
|
|
Assert(nthistime > 0);
|
|
|
|
|
2018-09-01 21:27:12 +02:00
|
|
|
memcpy(file->buffer.data + file->pos, ptr, nthistime);
|
1999-10-13 17:02:32 +02:00
|
|
|
|
|
|
|
file->dirty = true;
|
|
|
|
file->pos += nthistime;
|
|
|
|
if (file->nbytes < file->pos)
|
|
|
|
file->nbytes = file->pos;
|
|
|
|
ptr = (void *) ((char *) ptr + nthistime);
|
|
|
|
size -= nthistime;
|
|
|
|
nwritten += nthistime;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nwritten;
|
|
|
|
}
|
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
/*
|
|
|
|
* BufFileFlush
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
|
|
|
* Like fflush()
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
BufFileFlush(BufFile *file)
|
|
|
|
{
|
|
|
|
if (file->dirty)
|
|
|
|
{
|
|
|
|
BufFileDumpBuffer(file);
|
|
|
|
if (file->dirty)
|
|
|
|
return EOF;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
/*
|
|
|
|
* BufFileSeek
|
1999-10-13 17:02:32 +02:00
|
|
|
*
|
1999-10-16 21:49:28 +02:00
|
|
|
* Like fseek(), except that target position needs two values in order to
|
2017-11-25 19:19:43 +01:00
|
|
|
* work when logical filesize exceeds maximum value representable by off_t.
|
|
|
|
* We do not support relative seeks across more than that, however.
|
1999-10-16 21:49:28 +02:00
|
|
|
*
|
|
|
|
* Result is 0 if OK, EOF if not. Logical position is not moved if an
|
|
|
|
* impossible seek is attempted.
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
|
|
|
int
|
2008-03-10 21:06:27 +01:00
|
|
|
BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
|
1999-10-13 17:02:32 +02:00
|
|
|
{
|
2000-04-12 19:17:23 +02:00
|
|
|
int newFile;
|
2008-03-10 21:06:27 +01:00
|
|
|
off_t newOffset;
|
2000-04-12 19:17:23 +02:00
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
switch (whence)
|
|
|
|
{
|
|
|
|
case SEEK_SET:
|
1999-10-19 04:34:45 +02:00
|
|
|
if (fileno < 0)
|
1999-10-13 17:02:32 +02:00
|
|
|
return EOF;
|
|
|
|
newFile = fileno;
|
|
|
|
newOffset = offset;
|
|
|
|
break;
|
|
|
|
case SEEK_CUR:
|
2000-04-12 19:17:23 +02:00
|
|
|
|
1999-10-13 17:02:32 +02:00
|
|
|
/*
|
2000-04-12 19:17:23 +02:00
|
|
|
* Relative seek considers only the signed offset, ignoring
|
2005-10-15 04:49:52 +02:00
|
|
|
* fileno. Note that large offsets (> 1 gig) risk overflow in this
|
2008-03-10 21:06:27 +01:00
|
|
|
* add, unless we have 64-bit off_t.
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
|
|
|
newFile = file->curFile;
|
|
|
|
newOffset = (file->curOffset + file->pos) + offset;
|
|
|
|
break;
|
|
|
|
#ifdef NOT_USED
|
|
|
|
case SEEK_END:
|
|
|
|
/* could be implemented, not needed currently */
|
|
|
|
break;
|
|
|
|
#endif
|
|
|
|
default:
|
2003-07-25 00:04:15 +02:00
|
|
|
elog(ERROR, "invalid whence: %d", whence);
|
1999-10-13 17:02:32 +02:00
|
|
|
return EOF;
|
|
|
|
}
|
|
|
|
while (newOffset < 0)
|
|
|
|
{
|
|
|
|
if (--newFile < 0)
|
|
|
|
return EOF;
|
|
|
|
newOffset += MAX_PHYSICAL_FILESIZE;
|
|
|
|
}
|
|
|
|
if (newFile == file->curFile &&
|
|
|
|
newOffset >= file->curOffset &&
|
|
|
|
newOffset <= file->curOffset + file->nbytes)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Seek is to a point within existing buffer; we can just adjust
|
2014-05-06 18:12:18 +02:00
|
|
|
* pos-within-buffer, without flushing buffer. Note this is OK
|
2000-04-12 19:17:23 +02:00
|
|
|
* whether reading or writing, but buffer remains dirty if we were
|
|
|
|
* writing.
|
1999-10-13 17:02:32 +02:00
|
|
|
*/
|
|
|
|
file->pos = (int) (newOffset - file->curOffset);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
/* Otherwise, must reposition buffer, so flush any dirty data */
|
|
|
|
if (BufFileFlush(file) != 0)
|
|
|
|
return EOF;
|
2000-04-12 19:17:23 +02:00
|
|
|
|
1999-10-19 04:34:45 +02:00
|
|
|
/*
|
2000-04-12 19:17:23 +02:00
|
|
|
* At this point and no sooner, check for seek past last segment. The
|
2005-10-15 04:49:52 +02:00
|
|
|
* above flush could have created a new segment, so checking sooner would
|
|
|
|
* not work (at least not with this code).
|
1999-10-19 04:34:45 +02:00
|
|
|
*/
|
2017-11-17 02:52:57 +01:00
|
|
|
|
|
|
|
/* convert seek to "start of next seg" to "end of last seg" */
|
|
|
|
if (newFile == file->numFiles && newOffset == 0)
|
1999-10-19 04:34:45 +02:00
|
|
|
{
|
2017-11-17 02:52:57 +01:00
|
|
|
newFile--;
|
|
|
|
newOffset = MAX_PHYSICAL_FILESIZE;
|
|
|
|
}
|
|
|
|
while (newOffset > MAX_PHYSICAL_FILESIZE)
|
|
|
|
{
|
|
|
|
if (++newFile >= file->numFiles)
|
|
|
|
return EOF;
|
|
|
|
newOffset -= MAX_PHYSICAL_FILESIZE;
|
1999-10-19 04:34:45 +02:00
|
|
|
}
|
|
|
|
if (newFile >= file->numFiles)
|
|
|
|
return EOF;
|
|
|
|
/* Seek is OK! */
|
1999-10-13 17:02:32 +02:00
|
|
|
file->curFile = newFile;
|
|
|
|
file->curOffset = newOffset;
|
|
|
|
file->pos = 0;
|
|
|
|
file->nbytes = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
1999-10-16 21:49:28 +02:00
|
|
|
void
|
2008-03-10 21:06:27 +01:00
|
|
|
BufFileTell(BufFile *file, int *fileno, off_t *offset)
|
1999-10-13 17:02:32 +02:00
|
|
|
{
|
|
|
|
*fileno = file->curFile;
|
|
|
|
*offset = file->curOffset + file->pos;
|
|
|
|
}
|
1999-10-16 21:49:28 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* BufFileSeekBlock --- block-oriented seek
|
|
|
|
*
|
|
|
|
* Performs absolute seek to the start of the n'th BLCKSZ-sized block of
|
|
|
|
* the file. Note that users of this interface will fail if their files
|
|
|
|
* exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
|
|
|
|
* with tables bigger than that, either...
|
|
|
|
*
|
|
|
|
* Result is 0 if OK, EOF if not. Logical position is not moved if an
|
|
|
|
* impossible seek is attempted.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
BufFileSeekBlock(BufFile *file, long blknum)
|
|
|
|
{
|
|
|
|
return BufFileSeek(file,
|
2008-03-10 21:06:27 +01:00
|
|
|
(int) (blknum / BUFFILE_SEG_SIZE),
|
|
|
|
(off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
|
1999-10-16 21:49:28 +02:00
|
|
|
SEEK_SET);
|
|
|
|
}
|
|
|
|
|
2000-06-09 00:38:00 +02:00
|
|
|
#ifdef NOT_USED
/*
 * BufFileTellBlock --- block-oriented tell
 *
 * Returns the number of the BLCKSZ-sized block containing the current seek
 * position.  Any fractional part of a block in that position is ignored.
 */
long
BufFileTellBlock(BufFile *file)
{
	/* Whole blocks preceding the position within the current segment */
	long		result = (file->curOffset + file->pos) / BLCKSZ;

	/* Plus the blocks accounted for by all earlier segments */
	result += file->curFile * BUFFILE_SEG_SIZE;

	return result;
}

#endif
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
|
|
|
|
/*
|
Have BufFileSize() ereport() on FileSize() failure.
Move the responsibility for checking for and reporting a failure from
the only current BufFileSize() caller, logtape.c, to BufFileSize()
itself. Code within buffile.c is generally responsible for interfacing
with fd.c to report irrecoverable failures. This seems like a
convention that's worth sticking to.
Reorganizing things this way makes it easy to make the error message
raised in the event of BufFileSize() failure descriptive of the
underlying problem. We're now clear on the distinction between
temporary file name and BufFile name, and can show errno, confident that
its value actually relates to the error being reported. In passing, an
existing, similar buffile.c ereport() + errcode_for_file_access() site
is changed to follow the same conventions.
The API of the function BufFileSize() is changed by this commit, despite
already being in a stable release (Postgres 11). This seems acceptable,
since the BufFileSize() ABI was changed by commit aa551830421, which
hasn't made it into a point release yet. Besides, it's difficult to
imagine a third party BufFileSize() caller not just raising an error
anyway, since BufFile state should be considered corrupt when
BufFileSize() fails.
Per complaint from Tom Lane.
Discussion: https://postgr.es/m/26974.1540826748@sss.pgh.pa.us
Backpatch: 11-, where shared BufFiles were introduced.
2018-11-28 23:42:54 +01:00
|
|
|
* Return the current shared BufFile size.
|
2018-05-02 16:23:13 +02:00
|
|
|
*
|
|
|
|
* Counts any holes left behind by BufFileAppend as part of the size.
|
Have BufFileSize() ereport() on FileSize() failure.
Move the responsibility for checking for and reporting a failure from
the only current BufFileSize() caller, logtape.c, to BufFileSize()
itself. Code within buffile.c is generally responsible for interfacing
with fd.c to report irrecoverable failures. This seems like a
convention that's worth sticking to.
Reorganizing things this way makes it easy to make the error message
raised in the event of BufFileSize() failure descriptive of the
underlying problem. We're now clear on the distinction between
temporary file name and BufFile name, and can show errno, confident that
its value actually relates to the error being reported. In passing, an
existing, similar buffile.c ereport() + errcode_for_file_access() site
is changed to follow the same conventions.
The API of the function BufFileSize() is changed by this commit, despite
already being in a stable release (Postgres 11). This seems acceptable,
since the BufFileSize() ABI was changed by commit aa551830421, which
hasn't made it into a point release yet. Besides, it's difficult to
imagine a third party BufFileSize() caller not just raising an error
anyway, since BufFile state should be considered corrupt when
BufFileSize() fails.
Per complaint from Tom Lane.
Discussion: https://postgr.es/m/26974.1540826748@sss.pgh.pa.us
Backpatch: 11-, where shared BufFiles were introduced.
2018-11-28 23:42:54 +01:00
|
|
|
* ereport()s on failure.
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
*/
|
2018-11-15 00:34:04 +01:00
|
|
|
int64
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
BufFileSize(BufFile *file)
|
|
|
|
{
|
2018-11-15 00:34:04 +01:00
|
|
|
int64 lastFileSize;
|
2018-05-02 16:23:13 +02:00
|
|
|
|
Have BufFileSize() ereport() on FileSize() failure.
Move the responsibility for checking for and reporting a failure from
the only current BufFileSize() caller, logtape.c, to BufFileSize()
itself. Code within buffile.c is generally responsible for interfacing
with fd.c to report irrecoverable failures. This seems like a
convention that's worth sticking to.
Reorganizing things this way makes it easy to make the error message
raised in the event of BufFileSize() failure descriptive of the
underlying problem. We're now clear on the distinction between
temporary file name and BufFile name, and can show errno, confident that
its value actually relates to the error being reported. In passing, an
existing, similar buffile.c ereport() + errcode_for_file_access() site
is changed to follow the same conventions.
The API of the function BufFileSize() is changed by this commit, despite
already being in a stable release (Postgres 11). This seems acceptable,
since the BufFileSize() ABI was changed by commit aa551830421, which
hasn't made it into a point release yet. Besides, it's difficult to
imagine a third party BufFileSize() caller not just raising an error
anyway, since BufFile state should be considered corrupt when
BufFileSize() fails.
Per complaint from Tom Lane.
Discussion: https://postgr.es/m/26974.1540826748@sss.pgh.pa.us
Backpatch: 11-, where shared BufFiles were introduced.
2018-11-28 23:42:54 +01:00
|
|
|
Assert(file->fileset != NULL);
|
|
|
|
|
2018-11-06 21:51:50 +01:00
|
|
|
/* Get the size of the last physical file. */
|
|
|
|
lastFileSize = FileSize(file->files[file->numFiles - 1]);
|
2018-05-02 16:23:13 +02:00
|
|
|
if (lastFileSize < 0)
|
Have BufFileSize() ereport() on FileSize() failure.
Move the responsibility for checking for and reporting a failure from
the only current BufFileSize() caller, logtape.c, to BufFileSize()
itself. Code within buffile.c is generally responsible for interfacing
with fd.c to report irrecoverable failures. This seems like a
convention that's worth sticking to.
Reorganizing things this way makes it easy to make the error message
raised in the event of BufFileSize() failure descriptive of the
underlying problem. We're now clear on the distinction between
temporary file name and BufFile name, and can show errno, confident that
its value actually relates to the error being reported. In passing, an
existing, similar buffile.c ereport() + errcode_for_file_access() site
is changed to follow the same conventions.
The API of the function BufFileSize() is changed by this commit, despite
already being in a stable release (Postgres 11). This seems acceptable,
since the BufFileSize() ABI was changed by commit aa551830421, which
hasn't made it into a point release yet. Besides, it's difficult to
imagine a third party BufFileSize() caller not just raising an error
anyway, since BufFile state should be considered corrupt when
BufFileSize() fails.
Per complaint from Tom Lane.
Discussion: https://postgr.es/m/26974.1540826748@sss.pgh.pa.us
Backpatch: 11-, where shared BufFiles were introduced.
2018-11-28 23:42:54 +01:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
|
|
|
|
FilePathName(file->files[file->numFiles - 1]),
|
|
|
|
file->name)));
|
2018-05-02 16:23:13 +02:00
|
|
|
|
2018-11-15 00:34:04 +01:00
|
|
|
return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) +
|
2018-05-02 16:23:13 +02:00
|
|
|
lastFileSize;
|
Support parallel btree index builds.
To make this work, tuplesort.c and logtape.c must also support
parallelism, so this patch adds that infrastructure and then applies
it to the particular case of parallel btree index builds. Testing
to date shows that this can often be 2-3x faster than a serial
index build.
The model for deciding how many workers to use is fairly primitive
at present, but it's better than not having the feature. We can
refine it as we get more experience.
Peter Geoghegan with some help from Rushabh Lathia. While Heikki
Linnakangas is not an author of this patch, he wrote other patches
without which this feature would not have been possible, and
therefore the release notes should possibly credit him as an author
of this feature. Reviewed by Claudio Freire, Heikki Linnakangas,
Thomas Munro, Tels, Amit Kapila, me.
Discussion: http://postgr.es/m/CAM3SWZQKM=Pzc=CAHzRixKjp2eO5Q0Jg1SoFQqeXFQ647JiwqQ@mail.gmail.com
Discussion: http://postgr.es/m/CAH2-Wz=AxWqDoVvGU7dq856S4r6sJAj6DBn7VMtigkB33N5eyg@mail.gmail.com
2018-02-02 19:25:55 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append the contents of source file (managed within shared fileset) to
|
|
|
|
* end of target file (managed within same shared fileset).
|
|
|
|
*
|
|
|
|
* Note that operation subsumes ownership of underlying resources from
|
|
|
|
* "source". Caller should never call BufFileClose against source having
|
|
|
|
* called here first. Resource owners for source and target must match,
|
|
|
|
* too.
|
|
|
|
*
|
|
|
|
* This operation works by manipulating lists of segment files, so the
|
|
|
|
* file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
|
|
|
|
* boundary, typically creating empty holes before the boundary. These
|
|
|
|
* areas do not contain any interesting data, and cannot be read from by
|
|
|
|
* caller.
|
|
|
|
*
|
|
|
|
* Returns the block number within target where the contents of source
|
|
|
|
* begins. Caller should apply this as an offset when working off block
|
|
|
|
* positions that are in terms of the original BufFile space.
|
|
|
|
*/
|
|
|
|
long
|
|
|
|
BufFileAppend(BufFile *target, BufFile *source)
|
|
|
|
{
|
|
|
|
long startBlock = target->numFiles * BUFFILE_SEG_SIZE;
|
|
|
|
int newNumFiles = target->numFiles + source->numFiles;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
Assert(target->fileset != NULL);
|
|
|
|
Assert(source->readOnly);
|
|
|
|
Assert(!source->dirty);
|
|
|
|
Assert(source->fileset != NULL);
|
|
|
|
|
|
|
|
if (target->resowner != source->resowner)
|
|
|
|
elog(ERROR, "could not append BufFile with non-matching resource owner");
|
|
|
|
|
|
|
|
target->files = (File *)
|
|
|
|
repalloc(target->files, sizeof(File) * newNumFiles);
|
|
|
|
for (i = target->numFiles; i < newNumFiles; i++)
|
|
|
|
target->files[i] = source->files[i - target->numFiles];
|
|
|
|
target->numFiles = newNumFiles;
|
|
|
|
|
|
|
|
return startBlock;
|
|
|
|
}
|