1996-08-28 03:59:28 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
1999-02-14 00:22:53 +01:00
|
|
|
* bufmgr.h
|
1996-08-28 03:59:28 +02:00
|
|
|
* POSTGRES buffer manager definitions.
|
|
|
|
*
|
|
|
|
*
|
2023-01-02 21:00:37 +01:00
|
|
|
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
|
2000-01-26 06:58:53 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
1996-08-28 03:59:28 +02:00
|
|
|
*
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/include/storage/bufmgr.h
|
1996-08-28 03:59:28 +02:00
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#ifndef BUFMGR_H
|
|
|
|
#define BUFMGR_H
|
|
|
|
|
2008-06-19 02:46:06 +02:00
|
|
|
#include "storage/block.h"
|
2000-11-30 02:39:08 +01:00
|
|
|
#include "storage/buf.h"
|
2008-06-09 00:00:48 +02:00
|
|
|
#include "storage/bufpage.h"
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
#include "storage/relfilelocator.h"
|
2008-06-19 02:46:06 +02:00
|
|
|
#include "utils/relcache.h"
|
2016-04-08 21:30:10 +02:00
|
|
|
#include "utils/snapmgr.h"
|
1996-08-28 03:59:28 +02:00
|
|
|
|
1998-01-13 05:05:12 +01:00
|
|
|
typedef void *Block;			/* opaque pointer to a buffer's page contents */
|
1996-08-28 03:59:28 +02:00
|
|
|
|
2023-02-10 07:22:26 +01:00
|
|
|
/*
 * Strategy types accepted by GetAccessStrategy().
 *
 * When introducing a new BufferAccessStrategyType, add a matching IOContext
 * as well, so that I/O statistics gathered under the new strategy are
 * tracked.
 */
typedef enum BufferAccessStrategyType
{
	BAS_NORMAL,					/* ordinary random access */
	BAS_BULKREAD,				/* large read-only scan; hint bit updates
								 * are permitted */
	BAS_BULKWRITE,				/* large multi-block write, e.g. COPY IN */
	BAS_VACUUM,					/* VACUUM */
} BufferAccessStrategyType;
|
|
|
|
|
Unite ReadBufferWithFork, ReadBufferWithStrategy, and ZeroOrReadBuffer
functions into one ReadBufferExtended function, that takes the strategy
and mode as argument. There's three modes, RBM_NORMAL which is the default
used by plain ReadBuffer(), RBM_ZERO, which replaces ZeroOrReadBuffer, and
a new mode RBM_ZERO_ON_ERROR, which allows callers to read corrupt pages
without throwing an error. The FSM needs the new mode to recover from
corrupt pages, which could happen if we crash after extending an FSM file,
and the new page is "torn".
Add fork number to some error messages in bufmgr.c, that still lacked it.
2008-10-31 16:05:00 +01:00
|
|
|
/* Modes accepted by ReadBufferExtended() */
typedef enum
{
	RBM_NORMAL,					/* ordinary read */
	RBM_ZERO_AND_LOCK,			/* don't read from disk; the caller will
								 * initialize the page.  The page is also
								 * returned locked. */
	RBM_ZERO_AND_CLEANUP_LOCK,	/* as RBM_ZERO_AND_LOCK, except the page is
								 * locked in "cleanup" mode */
	RBM_ZERO_ON_ERROR,			/* read, substituting an all-zeros page if
								 * the read fails */
	RBM_NORMAL_NO_LOG,			/* during WAL replay, don't log the page as
								 * invalid; otherwise identical to
								 * RBM_NORMAL */
} ReadBufferMode;
|
|
|
|
|
2020-04-08 03:36:45 +02:00
|
|
|
/*
|
|
|
|
* Type returned by PrefetchBuffer().
|
|
|
|
*/
|
|
|
|
typedef struct PrefetchBufferResult
|
|
|
|
{
|
|
|
|
Buffer recent_buffer; /* If valid, a hit (recheck needed!) */
|
|
|
|
bool initiated_io; /* If true, a miss resulting in async I/O */
|
|
|
|
} PrefetchBufferResult;
|
|
|
|
|
bufmgr: Introduce infrastructure for faster relation extension
The primary bottlenecks for relation extension are:
1) The extension lock is held while acquiring a victim buffer for the new
page. Acquiring a victim buffer can require writing out the old page
contents including possibly needing to flush WAL.
2) When extending via ReadBuffer() et al, we write a zero page during the
extension, and then later write out the actual page contents. This can
nearly double the write rate.
3) The existing bulk relation extension infrastructure in hio.c just amortized
the cost of acquiring the relation extension lock, but none of the other
costs.
Unfortunately 1) cannot currently be addressed in a central manner as the
callers to ReadBuffer() need to acquire the extension lock. To address that,
this commit moves the responsibility for acquiring the extension lock
into bufmgr.c functions. That allows to acquire the relation extension lock
for just the required time. This will also allow us to improve relation
extension further, without changing callers.
The reason we write all-zeroes pages during relation extension is that we hope
to get ENOSPC errors earlier that way (largely works, except for CoW
filesystems). It is easier to handle out-of-space errors gracefully if the
page doesn't yet contain actual tuples. This commit addresses 2), by using the
recently introduced smgrzeroextend(), which extends the relation, without
dirtying the kernel page cache for all the extended pages.
To address 3), this commit introduces a function to extend a relation by
multiple blocks at a time.
There are three new exposed functions: ExtendBufferedRel() for extending the
relation by a single block, ExtendBufferedRelBy() to extend a relation by
multiple blocks at once, and ExtendBufferedRelTo() for extending a relation up
to a certain size.
To avoid duplicating code between ReadBuffer(P_NEW) and the new functions,
ReadBuffer(P_NEW) now implements relation extension with
ExtendBufferedRel(), using a flag to tell ExtendBufferedRel() that the
relation lock is already held.
Note that this commit does not yet lead to a meaningful performance or
scalability improvement - for that uses of ReadBuffer(P_NEW) will need to be
converted to ExtendBuffered*(), which will be done in subsequent commits.
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
2023-04-06 01:21:09 +02:00
|
|
|
/*
 * Flags that modify the behaviour of the ExtendBufferedRel* family.
 */
typedef enum ExtendBufferedFlags
{
	/*
	 * Do not take the relation extension lock.  Only safe when the relation
	 * is not shared, when an access exclusive lock is held, or when running
	 * in the startup process.
	 */
	EB_SKIP_EXTENSION_LOCK = (1 << 0),

	/* the extension is being performed as part of recovery */
	EB_PERFORMING_RECOVERY = (1 << 1),

	/*
	 * Create the fork if it does not already exist.  Plausibly useful only
	 * for relation forks.
	 */
	EB_CREATE_FORK_IF_NEEDED = (1 << 2),

	/* return the first (possibly only) buffer already locked */
	EB_LOCK_FIRST = (1 << 3),

	/* clear the smgr size cache */
	EB_CLEAR_SIZE_CACHE = (1 << 4),

	/* flags below are for internal use */
	EB_LOCK_TARGET = (1 << 5),
} ExtendBufferedFlags;
|
|
|
|
|
|
|
|
/*
|
2023-08-23 02:10:18 +02:00
|
|
|
* Some functions identify relations either by relation or smgr +
|
|
|
|
* relpersistence. Used via the BMR_REL()/BMR_SMGR() macros below. This
|
|
|
|
* allows us to use the same function for both recovery and normal operation.
|
bufmgr: Introduce infrastructure for faster relation extension
The primary bottlenecks for relation extension are:
1) The extension lock is held while acquiring a victim buffer for the new
page. Acquiring a victim buffer can require writing out the old page
contents including possibly needing to flush WAL.
2) When extending via ReadBuffer() et al, we write a zero page during the
extension, and then later write out the actual page contents. This can
nearly double the write rate.
3) The existing bulk relation extension infrastructure in hio.c just amortized
the cost of acquiring the relation extension lock, but none of the other
costs.
Unfortunately 1) cannot currently be addressed in a central manner as the
callers to ReadBuffer() need to acquire the extension lock. To address that,
this this commit moves the responsibility for acquiring the extension lock
into bufmgr.c functions. That allows to acquire the relation extension lock
for just the required time. This will also allow us to improve relation
extension further, without changing callers.
The reason we write all-zeroes pages during relation extension is that we hope
to get ENOSPC errors earlier that way (largely works, except for CoW
filesystems). It is easier to handle out-of-space errors gracefully if the
page doesn't yet contain actual tuples. This commit addresses 2), by using the
recently introduced smgrzeroextend(), which extends the relation, without
dirtying the kernel page cache for all the extended pages.
To address 3), this commit introduces a function to extend a relation by
multiple blocks at a time.
There are three new exposed functions: ExtendBufferedRel() for extending the
relation by a single block, ExtendBufferedRelBy() to extend a relation by
multiple blocks at once, and ExtendBufferedRelTo() for extending a relation up
to a certain size.
To avoid duplicating code between ReadBuffer(P_NEW) and the new functions,
ReadBuffer(P_NEW) now implements relation extension with
ExtendBufferedRel(), using a flag to tell ExtendBufferedRel() that the
relation lock is already held.
Note that this commit does not yet lead to a meaningful performance or
scalability improvement - for that uses of ReadBuffer(P_NEW) will need to be
converted to ExtendBuffered*(), which will be done in subsequent commits.
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
2023-04-06 01:21:09 +02:00
|
|
|
*/
|
2023-08-23 02:10:18 +02:00
|
|
|
typedef struct BufferManagerRelation
|
bufmgr: Introduce infrastructure for faster relation extension
The primary bottlenecks for relation extension are:
1) The extension lock is held while acquiring a victim buffer for the new
page. Acquiring a victim buffer can require writing out the old page
contents including possibly needing to flush WAL.
2) When extending via ReadBuffer() et al, we write a zero page during the
extension, and then later write out the actual page contents. This can
nearly double the write rate.
3) The existing bulk relation extension infrastructure in hio.c just amortized
the cost of acquiring the relation extension lock, but none of the other
costs.
Unfortunately 1) cannot currently be addressed in a central manner as the
callers to ReadBuffer() need to acquire the extension lock. To address that,
this this commit moves the responsibility for acquiring the extension lock
into bufmgr.c functions. That allows to acquire the relation extension lock
for just the required time. This will also allow us to improve relation
extension further, without changing callers.
The reason we write all-zeroes pages during relation extension is that we hope
to get ENOSPC errors earlier that way (largely works, except for CoW
filesystems). It is easier to handle out-of-space errors gracefully if the
page doesn't yet contain actual tuples. This commit addresses 2), by using the
recently introduced smgrzeroextend(), which extends the relation, without
dirtying the kernel page cache for all the extended pages.
To address 3), this commit introduces a function to extend a relation by
multiple blocks at a time.
There are three new exposed functions: ExtendBufferedRel() for extending the
relation by a single block, ExtendBufferedRelBy() to extend a relation by
multiple blocks at once, and ExtendBufferedRelTo() for extending a relation up
to a certain size.
To avoid duplicating code between ReadBuffer(P_NEW) and the new functions,
ReadBuffer(P_NEW) now implements relation extension with
ExtendBufferedRel(), using a flag to tell ExtendBufferedRel() that the
relation lock is already held.
Note that this commit does not yet lead to a meaningful performance or
scalability improvement - for that uses of ReadBuffer(P_NEW) will need to be
converted to ExtendBuffered*(), which will be done in subsequent commits.
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
2023-04-06 01:21:09 +02:00
|
|
|
{
|
|
|
|
Relation rel;
|
|
|
|
struct SMgrRelationData *smgr;
|
|
|
|
char relpersistence;
|
2023-08-23 02:10:18 +02:00
|
|
|
} BufferManagerRelation;
|
bufmgr: Introduce infrastructure for faster relation extension
The primary bottlenecks for relation extension are:
1) The extension lock is held while acquiring a victim buffer for the new
page. Acquiring a victim buffer can require writing out the old page
contents including possibly needing to flush WAL.
2) When extending via ReadBuffer() et al, we write a zero page during the
extension, and then later write out the actual page contents. This can
nearly double the write rate.
3) The existing bulk relation extension infrastructure in hio.c just amortized
the cost of acquiring the relation extension lock, but none of the other
costs.
Unfortunately 1) cannot currently be addressed in a central manner as the
callers to ReadBuffer() need to acquire the extension lock. To address that,
this commit moves the responsibility for acquiring the extension lock
into bufmgr.c functions. That allows to acquire the relation extension lock
for just the required time. This will also allow us to improve relation
extension further, without changing callers.
The reason we write all-zeroes pages during relation extension is that we hope
to get ENOSPC errors earlier that way (largely works, except for CoW
filesystems). It is easier to handle out-of-space errors gracefully if the
page doesn't yet contain actual tuples. This commit addresses 2), by using the
recently introduced smgrzeroextend(), which extends the relation, without
dirtying the kernel page cache for all the extended pages.
To address 3), this commit introduces a function to extend a relation by
multiple blocks at a time.
There are three new exposed functions: ExtendBufferedRel() for extending the
relation by a single block, ExtendBufferedRelBy() to extend a relation by
multiple blocks at once, and ExtendBufferedRelTo() for extending a relation up
to a certain size.
To avoid duplicating code between ReadBuffer(P_NEW) and the new functions,
ReadBuffer(P_NEW) now implements relation extension with
ExtendBufferedRel(), using a flag to tell ExtendBufferedRel() that the
relation lock is already held.
Note that this commit does not yet lead to a meaningful performance or
scalability improvement - for that uses of ReadBuffer(P_NEW) will need to be
converted to ExtendBuffered*(), which will be done in subsequent commits.
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
2023-04-06 01:21:09 +02:00
|
|
|
|
2023-08-23 02:10:18 +02:00
|
|
|
/* build a BufferManagerRelation identifying the relation by Relation */
#define BMR_REL(p_rel) ((BufferManagerRelation){.rel = p_rel})
/* build a BufferManagerRelation identifying the relation by smgr + persistence */
#define BMR_SMGR(p_smgr, p_relpersistence) ((BufferManagerRelation){.smgr = p_smgr, .relpersistence = p_relpersistence})
|
bufmgr: Introduce infrastructure for faster relation extension
The primary bottlenecks for relation extension are:
1) The extension lock is held while acquiring a victim buffer for the new
page. Acquiring a victim buffer can require writing out the old page
contents including possibly needing to flush WAL.
2) When extending via ReadBuffer() et al, we write a zero page during the
extension, and then later write out the actual page contents. This can
nearly double the write rate.
3) The existing bulk relation extension infrastructure in hio.c just amortized
the cost of acquiring the relation extension lock, but none of the other
costs.
Unfortunately 1) cannot currently be addressed in a central manner as the
callers to ReadBuffer() need to acquire the extension lock. To address that,
this commit moves the responsibility for acquiring the extension lock
into bufmgr.c functions. That allows to acquire the relation extension lock
for just the required time. This will also allow us to improve relation
extension further, without changing callers.
The reason we write all-zeroes pages during relation extension is that we hope
to get ENOSPC errors earlier that way (largely works, except for CoW
filesystems). It is easier to handle out-of-space errors gracefully if the
page doesn't yet contain actual tuples. This commit addresses 2), by using the
recently introduced smgrzeroextend(), which extends the relation, without
dirtying the kernel page cache for all the extended pages.
To address 3), this commit introduces a function to extend a relation by
multiple blocks at a time.
There are three new exposed functions: ExtendBufferedRel() for extending the
relation by a single block, ExtendBufferedRelBy() to extend a relation by
multiple blocks at once, and ExtendBufferedRelTo() for extending a relation up
to a certain size.
To avoid duplicating code between ReadBuffer(P_NEW) and the new functions,
ReadBuffer(P_NEW) now implements relation extension with
ExtendBufferedRel(), using a flag to tell ExtendBufferedRel() that the
relation lock is already held.
Note that this commit does not yet lead to a meaningful performance or
scalability improvement - for that uses of ReadBuffer(P_NEW) will need to be
converted to ExtendBuffered*(), which will be done in subsequent commits.
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
2023-04-06 01:21:09 +02:00
|
|
|
|
|
|
|
|
Allow to trigger kernel writeback after a configurable number of writes.
Currently writes to the main data files of postgres all go through the
OS page cache. This means that some operating systems can end up
collecting a large number of dirty buffers in their respective page
caches. When these dirty buffers are flushed to storage rapidly, be it
because of fsync(), timeouts, or dirty ratios, latency for other reads
and writes can increase massively. This is the primary reason for
regular massive stalls observed in real world scenarios and artificial
benchmarks; on rotating disks stalls on the order of hundreds of seconds
have been observed.
On linux it is possible to control this by reducing the global dirty
limits significantly, reducing the above problem. But global
configuration is rather problematic because it'll affect other
applications; also PostgreSQL itself doesn't always generally want this
behavior, e.g. for temporary files it's undesirable.
Several operating systems allow some control over the kernel page
cache. Linux has sync_file_range(2), several posix systems have msync(2)
and posix_fadvise(2). sync_file_range(2) is preferable because it
requires no special setup, whereas msync() requires the to-be-flushed
range to be mmap'ed. For the purpose of flushing dirty data
posix_fadvise(2) is the worst alternative, as flushing dirty data is
just a side-effect of POSIX_FADV_DONTNEED, which also removes the pages
from the page cache. Thus the feature is enabled by default only on
linux, but can be enabled on all systems that have any of the above
APIs.
While desirable and likely possible this patch does not contain an
implementation for windows.
With the infrastructure added, writes made via checkpointer, bgwriter
and normal user backends can be flushed after a configurable number of
writes. Each of these sources of writes controlled by a separate GUC,
checkpointer_flush_after, bgwriter_flush_after and backend_flush_after
respectively; they're separate because the number of flushes that are
good are separate, and because the performance considerations of
controlled flushing for each of these are different.
A later patch will add checkpoint sorting - after that flushes from the
checkpoint will almost always be desirable. Bgwriter flushes are most of
the time going to be random, which are slow on lots of storage hardware.
Flushing in backends works well if the storage and bgwriter can keep up,
but if not it can have negative consequences. This patch is likely to
have negative performance consequences without checkpoint sorting, but
unfortunately so has sorting without flush control.
Discussion: alpine.DEB.2.10.1506011320000.28433@sto
Author: Fabien Coelho and Andres Freund
2016-02-19 21:13:05 +01:00
|
|
|
/* forward declared, to avoid having to expose buf_internals.h here */
struct WritebackContext;

/* forward declared, to avoid including smgr.h here */
struct SMgrRelationData;
|
|
|
|
|
2000-11-30 02:39:08 +01:00
|
|
|
/* in globals.c ... this duplicates miscadmin.h */
extern PGDLLIMPORT int NBuffers;

/* in bufmgr.c */
extern PGDLLIMPORT bool zero_damaged_pages;
extern PGDLLIMPORT int bgwriter_lru_maxpages;
extern PGDLLIMPORT double bgwriter_lru_multiplier;
extern PGDLLIMPORT bool track_io_timing;
|
2022-10-31 04:44:48 +01:00
|
|
|
|
|
|
|
/*
 * Default values for effective_io_concurrency and
 * maintenance_io_concurrency; nonzero only when prefetching is available.
 */
#if defined(USE_PREFETCH)
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY 1
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY 10
#else
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY 0
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY 0
#endif
|
2022-04-08 14:16:38 +02:00
|
|
|
extern PGDLLIMPORT int effective_io_concurrency;
extern PGDLLIMPORT int maintenance_io_concurrency;

extern PGDLLIMPORT int checkpoint_flush_after;
extern PGDLLIMPORT int backend_flush_after;
extern PGDLLIMPORT int bgwriter_flush_after;
|
Allow to trigger kernel writeback after a configurable number of writes.
Currently writes to the main data files of postgres all go through the
OS page cache. This means that some operating systems can end up
collecting a large number of dirty buffers in their respective page
caches. When these dirty buffers are flushed to storage rapidly, be it
because of fsync(), timeouts, or dirty ratios, latency for other reads
and writes can increase massively. This is the primary reason for
regular massive stalls observed in real world scenarios and artificial
benchmarks; on rotating disks stalls on the order of hundreds of seconds
have been observed.
On linux it is possible to control this by reducing the global dirty
limits significantly, reducing the above problem. But global
configuration is rather problematic because it'll affect other
applications; also PostgreSQL itself doesn't always generally want this
behavior, e.g. for temporary files it's undesirable.
Several operating systems allow some control over the kernel page
cache. Linux has sync_file_range(2), several posix systems have msync(2)
and posix_fadvise(2). sync_file_range(2) is preferable because it
requires no special setup, whereas msync() requires the to-be-flushed
range to be mmap'ed. For the purpose of flushing dirty data
posix_fadvise(2) is the worst alternative, as flushing dirty data is
just a side-effect of POSIX_FADV_DONTNEED, which also removes the pages
from the page cache. Thus the feature is enabled by default only on
linux, but can be enabled on all systems that have any of the above
APIs.
While desirable and likely possible this patch does not contain an
implementation for windows.
With the infrastructure added, writes made via checkpointer, bgwriter
and normal user backends can be flushed after a configurable number of
writes. Each of these sources of writes controlled by a separate GUC,
checkpointer_flush_after, bgwriter_flush_after and backend_flush_after
respectively; they're separate because the number of flushes that are
good are separate, and because the performance considerations of
controlled flushing for each of these are different.
A later patch will add checkpoint sorting - after that flushes from the
checkpoint will almost always be desirable. Bgwriter flushes are most of
the time going to be random, which are slow on lots of storage hardware.
Flushing in backends works well if the storage and bgwriter can keep up,
but if not it can have negative consequences. This patch is likely to
have negative performance consequences without checkpoint sorting, but
unfortunately so has sorting without flush control.
Discussion: alpine.DEB.2.10.1506011320000.28433@sto
Author: Fabien Coelho and Andres Freund
2016-02-19 21:13:05 +01:00
|
|
|
|
2000-11-30 02:39:08 +01:00
|
|
|
/* in buf_init.c */
|
2007-07-25 14:22:54 +02:00
|
|
|
extern PGDLLIMPORT char *BufferBlocks;
|
2000-11-30 02:39:08 +01:00
|
|
|
|
|
|
|
/* in localbuf.c */
|
2007-07-25 14:22:54 +02:00
|
|
|
extern PGDLLIMPORT int NLocBuffer;
|
|
|
|
extern PGDLLIMPORT Block *LocalBufferBlockPointers;
|
|
|
|
extern PGDLLIMPORT int32 *LocalRefCount;
|
2000-11-30 02:39:08 +01:00
|
|
|
|
2015-09-08 17:51:42 +02:00
|
|
|
/* upper limit for effective_io_concurrency */
|
|
|
|
#define MAX_IO_CONCURRENCY 1000
|
|
|
|
|
2004-04-26 01:50:58 +02:00
|
|
|
/* special block number for ReadBuffer() */
|
1996-08-28 03:59:28 +02:00
|
|
|
#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
|
|
|
|
|
2000-11-30 02:39:08 +01:00
|
|
|
/*
|
2005-03-04 21:21:07 +01:00
|
|
|
* Buffer content lock modes (mode argument for LockBuffer())
|
2000-11-30 02:39:08 +01:00
|
|
|
*/
|
|
|
|
#define BUFFER_LOCK_UNLOCK 0
|
|
|
|
#define BUFFER_LOCK_SHARE 1
|
|
|
|
#define BUFFER_LOCK_EXCLUSIVE 2
|
|
|
|
|
2016-04-20 15:31:19 +02:00
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/*
|
|
|
|
* prototypes for functions in bufmgr.c
|
|
|
|
*/
|
2020-04-08 03:36:45 +02:00
|
|
|
extern PrefetchBufferResult PrefetchSharedBuffer(struct SMgrRelationData *smgr_reln,
|
|
|
|
ForkNumber forkNum,
|
|
|
|
BlockNumber blockNum);
|
|
|
|
extern PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum,
|
|
|
|
BlockNumber blockNum);
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
extern bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum,
|
2021-04-08 07:48:37 +02:00
|
|
|
BlockNumber blockNum, Buffer recent_buffer);
|
1996-08-28 03:59:28 +02:00
|
|
|
extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
|
Unite ReadBufferWithFork, ReadBufferWithStrategy, and ZeroOrReadBuffer
functions into one ReadBufferExtended function, that takes the strategy
and mode as argument. There's three modes, RBM_NORMAL which is the default
used by plain ReadBuffer(), RBM_ZERO, which replaces ZeroOrReadBuffer, and
a new mode RBM_ZERO_ON_ERROR, which allows callers to read corrupt pages
without throwing an error. The FSM needs the new mode to recover from
corrupt pages, which could happen if we crash after extending an FSM file,
and the new page is "torn".
Add fork number to some error messages in bufmgr.c, that still lacked it.
2008-10-31 16:05:00 +01:00
|
|
|
extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
|
|
|
|
BlockNumber blockNum, ReadBufferMode mode,
|
|
|
|
BufferAccessStrategy strategy);
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
extern Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator,
|
Unite ReadBufferWithFork, ReadBufferWithStrategy, and ZeroOrReadBuffer
functions into one ReadBufferExtended function, that takes the strategy
and mode as argument. There's three modes, RBM_NORMAL which is the default
used by plain ReadBuffer(), RBM_ZERO, which replaces ZeroOrReadBuffer, and
a new mode RBM_ZERO_ON_ERROR, which allows callers to read corrupt pages
without throwing an error. The FSM needs the new mode to recover from
corrupt pages, which could happen if we crash after extending an FSM file,
and the new page is "torn".
Add fork number to some error messages in bufmgr.c, that still lacked it.
2008-10-31 16:05:00 +01:00
|
|
|
ForkNumber forkNum, BlockNumber blockNum,
|
Add new block-by-block strategy for CREATE DATABASE.
Because this strategy logs changes on a block-by-block basis, it
avoids the need to checkpoint before and after the operation.
However, because it logs each changed block individually, it might
generate a lot of extra write-ahead logging if the template database
is large. Therefore, the older strategy remains available via a new
STRATEGY parameter to CREATE DATABASE, and a corresponding --strategy
option to createdb.
Somewhat controversially, this patch assembles the list of relations
to be copied to the new database by reading the pg_class relation of
the template database. Cross-database access like this isn't normally
possible, but it can be made to work here because there can't be any
connections to the database being copied, nor can it contain any
in-doubt transactions. Even so, we have to use lower-level interfaces
than normal, since the table scan and relcache interfaces will not
work for a database to which we're not connected. The advantage of
this approach is that we do not need to rely on the filesystem to
determine what ought to be copied, but instead on PostgreSQL's own
knowledge of the database structure. This avoids, for example,
copying stray files that happen to be located in the source database
directory.
Dilip Kumar, with a fairly large number of cosmetic changes by me.
Reviewed and tested by Ashutosh Sharma, Andres Freund, John Naylor,
Greg Nancarrow, Neha Sharma. Additional feedback from Bruce Momjian,
Heikki Linnakangas, Julien Rouhaud, Adam Brusselback, Kyotaro
Horiguchi, Tomas Vondra, Andrew Dunstan, Álvaro Herrera, and others.
Discussion: http://postgr.es/m/CA+TgmoYtcdxBjLh31DLxUXHxFVMPGzrU5_T=CYCvRyFHywSBUQ@mail.gmail.com
2022-03-29 17:31:43 +02:00
|
|
|
ReadBufferMode mode, BufferAccessStrategy strategy,
|
|
|
|
bool permanent);
|
2004-04-21 20:06:30 +02:00
|
|
|
extern void ReleaseBuffer(Buffer buffer);
|
2006-04-01 01:32:07 +02:00
|
|
|
extern void UnlockReleaseBuffer(Buffer buffer);
|
2023-10-24 02:17:46 +02:00
|
|
|
extern bool BufferIsExclusiveLocked(Buffer buffer);
|
|
|
|
extern bool BufferIsDirty(Buffer buffer);
|
2006-04-01 01:32:07 +02:00
|
|
|
extern void MarkBufferDirty(Buffer buffer);
|
2004-07-17 05:32:14 +02:00
|
|
|
extern void IncrBufferRefCount(Buffer buffer);
|
2023-04-05 19:42:17 +02:00
|
|
|
extern void CheckBufferIsPinnedOnce(Buffer buffer);
|
1998-01-24 23:50:57 +01:00
|
|
|
extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
|
2001-06-29 23:08:25 +02:00
|
|
|
BlockNumber blockNum);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2023-08-23 02:10:18 +02:00
|
|
|
extern Buffer ExtendBufferedRel(BufferManagerRelation bmr,
|
bufmgr: Introduce infrastructure for faster relation extension
The primary bottlenecks for relation extension are:
1) The extension lock is held while acquiring a victim buffer for the new
page. Acquiring a victim buffer can require writing out the old page
contents including possibly needing to flush WAL.
2) When extending via ReadBuffer() et al, we write a zero page during the
extension, and then later write out the actual page contents. This can
nearly double the write rate.
3) The existing bulk relation extension infrastructure in hio.c just amortized
the cost of acquiring the relation extension lock, but none of the other
costs.
Unfortunately 1) cannot currently be addressed in a central manner as the
callers to ReadBuffer() need to acquire the extension lock. To address that,
this commit moves the responsibility for acquiring the extension lock
into bufmgr.c functions. That allows to acquire the relation extension lock
for just the required time. This will also allow us to improve relation
extension further, without changing callers.
The reason we write all-zeroes pages during relation extension is that we hope
to get ENOSPC errors earlier that way (largely works, except for CoW
filesystems). It is easier to handle out-of-space errors gracefully if the
page doesn't yet contain actual tuples. This commit addresses 2), by using the
recently introduced smgrzeroextend(), which extends the relation, without
dirtying the kernel page cache for all the extended pages.
To address 3), this commit introduces a function to extend a relation by
multiple blocks at a time.
There are three new exposed functions: ExtendBufferedRel() for extending the
relation by a single block, ExtendBufferedRelBy() to extend a relation by
multiple blocks at once, and ExtendBufferedRelTo() for extending a relation up
to a certain size.
To avoid duplicating code between ReadBuffer(P_NEW) and the new functions,
ReadBuffer(P_NEW) now implements relation extension with
ExtendBufferedRel(), using a flag to tell ExtendBufferedRel() that the
relation lock is already held.
Note that this commit does not yet lead to a meaningful performance or
scalability improvement - for that uses of ReadBuffer(P_NEW) will need to be
converted to ExtendBuffered*(), which will be done in subsequent commits.
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
2023-04-06 01:21:09 +02:00
|
|
|
ForkNumber forkNum,
|
|
|
|
BufferAccessStrategy strategy,
|
|
|
|
uint32 flags);
|
2023-08-23 02:10:18 +02:00
|
|
|
extern BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr,
|
bufmgr: Introduce infrastructure for faster relation extension
The primary bottlenecks for relation extension are:
1) The extension lock is held while acquiring a victim buffer for the new
page. Acquiring a victim buffer can require writing out the old page
contents including possibly needing to flush WAL.
2) When extending via ReadBuffer() et al, we write a zero page during the
extension, and then later write out the actual page contents. This can
nearly double the write rate.
3) The existing bulk relation extension infrastructure in hio.c just amortized
the cost of acquiring the relation extension lock, but none of the other
costs.
Unfortunately 1) cannot currently be addressed in a central manner as the
callers to ReadBuffer() need to acquire the extension lock. To address that,
this commit moves the responsibility for acquiring the extension lock
into bufmgr.c functions. That allows to acquire the relation extension lock
for just the required time. This will also allow us to improve relation
extension further, without changing callers.
The reason we write all-zeroes pages during relation extension is that we hope
to get ENOSPC errors earlier that way (largely works, except for CoW
filesystems). It is easier to handle out-of-space errors gracefully if the
page doesn't yet contain actual tuples. This commit addresses 2), by using the
recently introduced smgrzeroextend(), which extends the relation, without
dirtying the kernel page cache for all the extended pages.
To address 3), this commit introduces a function to extend a relation by
multiple blocks at a time.
There are three new exposed functions: ExtendBufferedRel() for extending the
relation by a single block, ExtendBufferedRelBy() to extend a relation by
multiple blocks at once, and ExtendBufferedRelTo() for extending a relation up
to a certain size.
To avoid duplicating code between ReadBuffer(P_NEW) and the new functions,
ReadBuffer(P_NEW) now implements relation extension with
ExtendBufferedRel(), using a flag to tell ExtendBufferedRel() that the
relation lock is already held.
Note that this commit does not yet lead to a meaningful performance or
scalability improvement - for that uses of ReadBuffer(P_NEW) will need to be
converted to ExtendBuffered*(), which will be done in subsequent commits.
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
2023-04-06 01:21:09 +02:00
|
|
|
ForkNumber fork,
|
|
|
|
BufferAccessStrategy strategy,
|
|
|
|
uint32 flags,
|
|
|
|
uint32 extend_by,
|
|
|
|
Buffer *buffers,
|
|
|
|
uint32 *extended_by);
|
2023-08-23 02:10:18 +02:00
|
|
|
extern Buffer ExtendBufferedRelTo(BufferManagerRelation bmr,
|
bufmgr: Introduce infrastructure for faster relation extension
The primary bottlenecks for relation extension are:
1) The extension lock is held while acquiring a victim buffer for the new
page. Acquiring a victim buffer can require writing out the old page
contents including possibly needing to flush WAL.
2) When extending via ReadBuffer() et al, we write a zero page during the
extension, and then later write out the actual page contents. This can
nearly double the write rate.
3) The existing bulk relation extension infrastructure in hio.c just amortized
the cost of acquiring the relation extension lock, but none of the other
costs.
Unfortunately 1) cannot currently be addressed in a central manner as the
callers to ReadBuffer() need to acquire the extension lock. To address that,
this commit moves the responsibility for acquiring the extension lock
into bufmgr.c functions. That allows to acquire the relation extension lock
for just the required time. This will also allow us to improve relation
extension further, without changing callers.
The reason we write all-zeroes pages during relation extension is that we hope
to get ENOSPC errors earlier that way (largely works, except for CoW
filesystems). It is easier to handle out-of-space errors gracefully if the
page doesn't yet contain actual tuples. This commit addresses 2), by using the
recently introduced smgrzeroextend(), which extends the relation, without
dirtying the kernel page cache for all the extended pages.
To address 3), this commit introduces a function to extend a relation by
multiple blocks at a time.
There are three new exposed functions: ExtendBufferedRel() for extending the
relation by a single block, ExtendBufferedRelBy() to extend a relation by
multiple blocks at once, and ExtendBufferedRelTo() for extending a relation up
to a certain size.
To avoid duplicating code between ReadBuffer(P_NEW) and the new functions,
ReadBuffer(P_NEW) now implements relation extension with
ExtendBufferedRel(), using a flag to tell ExtendBufferedRel() that the
relation lock is already held.
Note that this commit does not yet lead to a meaningful performance or
scalability improvement - for that uses of ReadBuffer(P_NEW) will need to be
converted to ExtendBuffered*(), which will be done in subsequent commits.
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
2023-04-06 01:21:09 +02:00
|
|
|
ForkNumber fork,
|
|
|
|
BufferAccessStrategy strategy,
|
|
|
|
uint32 flags,
|
|
|
|
BlockNumber extend_to,
|
|
|
|
ReadBufferMode mode);
|
|
|
|
|
2000-12-18 01:44:50 +01:00
|
|
|
extern void InitBufferPoolAccess(void);
|
2002-08-06 04:36:35 +02:00
|
|
|
extern void AtEOXact_Buffers(bool isCommit);
|
Make ResourceOwners more easily extensible.
Instead of having a separate array/hash for each resource kind, use a
single array and hash to hold all kinds of resources. This makes it
possible to introduce new resource "kinds" without having to modify
the ResourceOwnerData struct. In particular, this makes it possible
for extensions to register custom resource kinds.
The old approach was to have a small array of resources of each kind,
and if it fills up, switch to a hash table. The new approach also uses
an array and a hash, but now the array and the hash are used at the
same time. The array is used to hold the recently added resources, and
when it fills up, they are moved to the hash. This keeps the access to
recent entries fast, even when there are a lot of long-held resources.
All the resource-specific ResourceOwnerEnlarge*(),
ResourceOwnerRemember*(), and ResourceOwnerForget*() functions have
been replaced with three generic functions that take resource kind as
argument. For convenience, we still define resource-specific wrapper
macros around the generic functions with the old names, but they are
now defined in the source files that use those resource kinds.
The release callback no longer needs to call ResourceOwnerForget on
the resource being released. ResourceOwnerRelease unregisters the
resource from the owner before calling the callback. That needed some
changes in bufmgr.c and some other files, where releasing the
resources previously always called ResourceOwnerForget.
Each resource kind specifies a release priority, and
ResourceOwnerReleaseAll releases the resources in priority order. To
make that possible, we have to restrict what you can do between
phases. After calling ResourceOwnerRelease(), you are no longer
allowed to remember any more resources in it or to forget any
previously remembered resources by calling ResourceOwnerForget. There
was one case where that was done previously. At subtransaction commit,
AtEOSubXact_Inval() would handle the invalidation messages and call
RelationFlushRelation(), which temporarily increased the reference
count on the relation being flushed. We now switch to the parent
subtransaction's resource owner before calling AtEOSubXact_Inval(), so
that there is a valid ResourceOwner to temporarily hold that relcache
reference.
Other end-of-xact routines make similar calls to AtEOXact_Inval()
between release phases, but I didn't see any regression test failures
from those, so I'm not sure if they could reach a codepath that needs
remembering extra resources.
There were two exceptions to how the resource leak WARNINGs on commit
were printed previously: llvmjit silently released the context without
printing the warning, and a leaked buffer io triggered a PANIC. Now
everything prints a WARNING, including those cases.
Add tests in src/test/modules/test_resowner.
Reviewed-by: Aleksander Alekseev, Michael Paquier, Julien Rouhaud
Reviewed-by: Kyotaro Horiguchi, Hayato Kuroda, Álvaro Herrera, Zhihong Yu
Reviewed-by: Peter Eisentraut, Andres Freund
Discussion: https://www.postgresql.org/message-id/cbfabeb0-cd3c-e951-a572-19b365ed314d%40iki.fi
2023-11-08 12:30:50 +01:00
|
|
|
extern char *DebugPrintBufferRefcount(Buffer buffer);
|
2007-06-28 02:02:40 +02:00
|
|
|
extern void CheckPointBuffers(int flags);
|
1996-08-28 03:59:28 +02:00
|
|
|
extern BlockNumber BufferGetBlockNumber(Buffer buffer);
|
2010-12-29 12:48:53 +01:00
|
|
|
extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
|
|
|
|
ForkNumber forkNum);
|
2015-12-10 16:25:12 +01:00
|
|
|
extern void FlushOneBuffer(Buffer buffer);
|
2005-03-20 23:00:54 +01:00
|
|
|
extern void FlushRelationBuffers(Relation rel);
|
Skip WAL for new relfilenodes, under wal_level=minimal.
Until now, only selected bulk operations (e.g. COPY) did this. If a
given relfilenode received both a WAL-skipping COPY and a WAL-logged
operation (e.g. INSERT), recovery could lose tuples from the COPY. See
src/backend/access/transam/README section "Skipping WAL for New
RelFileNode" for the new coding rules. Maintainers of table access
methods should examine that section.
To maintain data durability, just before commit, we choose between an
fsync of the relfilenode and copying its contents to WAL. A new GUC,
wal_skip_threshold, guides that choice. If this change slows a workload
that creates small, permanent relfilenodes under wal_level=minimal, try
adjusting wal_skip_threshold. Users setting a timeout on COMMIT may
need to adjust that timeout, and log_min_duration_statement analysis
will reflect time consumption moving to COMMIT from commands like COPY.
Internally, this requires a reliable determination of whether
RollbackAndReleaseCurrentSubTransaction() would unlink a relation's
current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the
specification of rd_createSubid such that the field is zero when a new
rel has an old rd_node. Make relcache.c retain entries for certain
dropped relations until end of transaction.
Bump XLOG_PAGE_MAGIC, since this introduces XLOG_GIST_ASSIGN_LSN.
Future servers accept older WAL, so this bump is discretionary.
Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert
Haas. Heikki Linnakangas and Michael Paquier implemented earlier
designs that materially clarified the problem. Reviewed, in earlier
designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane,
Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout.
Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
2020-04-04 21:25:34 +02:00
|
|
|
extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
extern void CreateAndCopyRelationData(RelFileLocator src_rlocator,
|
|
|
|
RelFileLocator dst_rlocator,
|
Add new block-by-block strategy for CREATE DATABASE.
Because this strategy logs changes on a block-by-block basis, it
avoids the need to checkpoint before and after the operation.
However, because it logs each changed block individually, it might
generate a lot of extra write-ahead logging if the template database
is large. Therefore, the older strategy remains available via a new
STRATEGY parameter to CREATE DATABASE, and a corresponding --strategy
option to createdb.
Somewhat controversially, this patch assembles the list of relations
to be copied to the new database by reading the pg_class relation of
the template database. Cross-database access like this isn't normally
possible, but it can be made to work here because there can't be any
connections to the database being copied, nor can it contain any
in-doubt transactions. Even so, we have to use lower-level interfaces
than normal, since the table scan and relcache interfaces will not
work for a database to which we're not connected. The advantage of
this approach is that we do not need to rely on the filesystem to
determine what ought to be copied, but instead on PostgreSQL's own
knowledge of the database structure. This avoids, for example,
copying stray files that happen to be located in the source database
directory.
Dilip Kumar, with a fairly large number of cosmetic changes by me.
Reviewed and tested by Ashutosh Sharma, Andres Freund, John Naylor,
Greg Nancarrow, Neha Sharma. Additional feedback from Bruce Momjian,
Heikki Linnakangas, Julien Rouhaud, Adam Brusselback, Kyotaro
Horiguchi, Tomas Vondra, Andrew Dunstan, Álvaro Herrera, and others.
Discussion: http://postgr.es/m/CA+TgmoYtcdxBjLh31DLxUXHxFVMPGzrU5_T=CYCvRyFHywSBUQ@mail.gmail.com
2022-03-29 17:31:43 +02:00
|
|
|
bool permanent);
|
2007-06-28 02:02:40 +02:00
|
|
|
extern void FlushDatabaseBuffers(Oid dbid);
|
2022-07-12 16:26:48 +02:00
|
|
|
extern void DropRelationBuffers(struct SMgrRelationData *smgr_reln,
|
|
|
|
ForkNumber *forkNum,
|
|
|
|
int nforks, BlockNumber *firstDelBlock);
|
|
|
|
extern void DropRelationsAllBuffers(struct SMgrRelationData **smgr_reln,
|
|
|
|
int nlocators);
|
2006-03-29 23:17:39 +02:00
|
|
|
extern void DropDatabaseBuffers(Oid dbid);
|
2002-09-04 22:31:48 +02:00
|
|
|
|
2022-07-13 19:37:10 +02:00
|
|
|
#define RelationGetNumberOfBlocks(reln) \
|
|
|
|
RelationGetNumberOfBlocksInFork(reln, MAIN_FORKNUM)
|
|
|
|
|
2011-10-28 23:08:09 +02:00
|
|
|
extern bool BufferIsPermanent(Buffer buffer);
|
2013-03-22 14:54:07 +01:00
|
|
|
extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer);
|
2011-10-28 23:08:09 +02:00
|
|
|
|
2002-07-02 07:47:37 +02:00
|
|
|
#ifdef NOT_USED
|
1996-08-28 03:59:28 +02:00
|
|
|
extern void PrintPinnedBufs(void);
|
2002-07-02 07:47:37 +02:00
|
|
|
#endif
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
/*
 * Report which relation/fork/block a buffer is mapped to, via the three
 * output parameters.
 */
extern void BufferGetTag(Buffer buffer, RelFileLocator *rlocator,
						 ForkNumber *forknum, BlockNumber *blknum);
|
1997-09-08 04:41:22 +02:00
|
|
|
|
2013-06-17 17:02:12 +02:00
|
|
|
/* Mark a buffer dirty for a hint-style change; buffer_std presumably flags a
 * standard page layout -- see bufmgr.c for the exact contract. */
extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1998-12-15 13:47:01 +01:00
|
|
|
/* Release all buffer locks held by this backend. */
extern void UnlockBuffers(void);
/* Acquire or release a buffer content lock in the given mode. */
extern void LockBuffer(Buffer buffer, int mode);
/* Try to lock a buffer without waiting; false if not immediately available. */
extern bool ConditionalLockBuffer(Buffer buffer);
/* Acquire the stronger lock needed to clean (e.g. vacuum) a buffer. */
extern void LockBufferForCleanup(Buffer buffer);
/* Non-waiting variant of LockBufferForCleanup. */
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
/* Check whether a cleanup lock could be taken on the buffer. */
extern bool IsBufferCleanupOK(Buffer buffer);
/* Recovery-conflict support: do we hold a pin that delays recovery? */
extern bool HoldingBufferPinThatDelaysRecovery(void);
|
1998-12-15 13:47:01 +01:00
|
|
|
|
Allow to trigger kernel writeback after a configurable number of writes.
Currently writes to the main data files of postgres all go through the
OS page cache. This means that some operating systems can end up
collecting a large number of dirty buffers in their respective page
caches. When these dirty buffers are flushed to storage rapidly, be it
because of fsync(), timeouts, or dirty ratios, latency for other reads
and writes can increase massively. This is the primary reason for
regular massive stalls observed in real world scenarios and artificial
benchmarks; on rotating disks stalls on the order of hundreds of seconds
have been observed.
On linux it is possible to control this by reducing the global dirty
limits significantly, reducing the above problem. But global
configuration is rather problematic because it'll affect other
applications; also PostgreSQL itself doesn't always generally want this
behavior, e.g. for temporary files it's undesirable.
Several operating systems allow some control over the kernel page
cache. Linux has sync_file_range(2), several posix systems have msync(2)
and posix_fadvise(2). sync_file_range(2) is preferable because it
requires no special setup, whereas msync() requires the to-be-flushed
range to be mmap'ed. For the purpose of flushing dirty data
posix_fadvise(2) is the worst alternative, as flushing dirty data is
just a side-effect of POSIX_FADV_DONTNEED, which also removes the pages
from the page cache. Thus the feature is enabled by default only on
linux, but can be enabled on all systems that have any of the above
APIs.
While desirable and likely possible this patch does not contain an
implementation for windows.
With the infrastructure added, writes made via checkpointer, bgwriter
and normal user backends can be flushed after a configurable number of
writes. Each of these sources of writes controlled by a separate GUC,
checkpointer_flush_after, bgwriter_flush_after and backend_flush_after
respectively; they're separate because the number of flushes that are
good are separate, and because the performance considerations of
controlled flushing for each of these are different.
A later patch will add checkpoint sorting - after that flushes from the
checkpoint will almost always be desirable. Bgwriter flushes are most of
the time going to be random, which are slow on lots of storage hardware.
Flushing in backends works well if the storage and bgwriter can keep up,
but if not it can have negative consequences. This patch is likely to
have negative performance consequences without checkpoint sorting, but
unfortunately so has sorting without flush control.
Discussion: alpine.DEB.2.10.1506011320000.28433@sto
Author: Fabien Coelho and Andres Freund
2016-02-19 21:13:05 +01:00
|
|
|
/* Background-writer buffer sync pass; wb_context carries kernel-writeback
 * (flush control) state.  Return-value meaning: see bufmgr.c. */
extern bool BgBufferSync(struct WritebackContext *wb_context);
|
2000-10-28 18:21:00 +02:00
|
|
|
|
2022-08-04 10:36:21 +02:00
|
|
|
/* in buf_init.c */
/* Initialize the shared buffer pool at postmaster startup. */
extern void InitBufferPool(void);
/* Compute the amount of shared memory the buffer pool requires. */
extern Size BufferShmemSize(void);
|
|
|
|
|
|
|
|
/* in localbuf.c */
/* Backend-exit cleanup hook for local (temp-relation) buffers. */
extern void AtProcExit_LocalBuffers(void);
|
|
|
|
|
2005-03-04 21:21:07 +01:00
|
|
|
/* in freelist.c */
/* Create a buffer access strategy of the given type, with default sizing. */
extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
/* As above, but with an explicit ring size given in kilobytes. */
extern BufferAccessStrategy GetAccessStrategyWithSize(BufferAccessStrategyType btype,
													  int ring_size_kb);
/* Number of buffers in the strategy's ring. */
extern int	GetAccessStrategyBufferCount(BufferAccessStrategy strategy);
/* Release a strategy object obtained from the functions above. */
extern void FreeAccessStrategy(BufferAccessStrategy strategy);
|
2005-03-04 21:21:07 +01:00
|
|
|
|
2016-04-11 23:47:50 +02:00
|
|
|
|
|
|
|
/* inline functions */
|
|
|
|
|
2016-04-15 16:44:28 +02:00
|
|
|
/*
|
|
|
|
* Although this header file is nominally backend-only, certain frontend
|
2017-02-09 22:23:46 +01:00
|
|
|
* programs like pg_waldump include it. For compilers that emit static
|
2016-04-15 16:44:28 +02:00
|
|
|
* inline functions even when they're unused, that leads to unsatisfied
|
|
|
|
* external references; hence hide these with #ifndef FRONTEND.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef FRONTEND
|
|
|
|
|
2022-07-13 19:37:10 +02:00
|
|
|
/*
|
|
|
|
* BufferIsValid
|
|
|
|
* True iff the given buffer number is valid (either as a shared
|
|
|
|
* or local buffer).
|
|
|
|
*
|
|
|
|
* Note: For a long time this was defined the same as BufferIsPinned,
|
|
|
|
* that is it would say False if you didn't hold a pin on the buffer.
|
|
|
|
* I believe this was bogus and served only to mask logic errors.
|
|
|
|
* Code should always know whether it has a buffer reference,
|
|
|
|
* independently of the pin state.
|
|
|
|
*
|
|
|
|
* Note: For a further long time this was not quite the inverse of the
|
|
|
|
* BufferIsInvalid() macro, in that it also did sanity checks to verify
|
|
|
|
* that the buffer number was in range. Most likely, this macro was
|
|
|
|
* originally intended only to be used in assertions, but its use has
|
|
|
|
* since expanded quite a bit, and the overhead of making those checks
|
|
|
|
* even in non-assert-enabled builds can be significant. Thus, we've
|
|
|
|
* now demoted the range checks to assertions within the macro itself.
|
|
|
|
*/
|
|
|
|
static inline bool
|
|
|
|
BufferIsValid(Buffer bufnum)
|
|
|
|
{
|
|
|
|
Assert(bufnum <= NBuffers);
|
|
|
|
Assert(bufnum >= -NLocBuffer);
|
|
|
|
|
|
|
|
return bufnum != InvalidBuffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BufferGetBlock
|
|
|
|
* Returns a reference to a disk page image associated with a buffer.
|
|
|
|
*
|
|
|
|
* Note:
|
|
|
|
* Assumes buffer is valid.
|
|
|
|
*/
|
|
|
|
static inline Block
|
|
|
|
BufferGetBlock(Buffer buffer)
|
|
|
|
{
|
|
|
|
Assert(BufferIsValid(buffer));
|
|
|
|
|
|
|
|
if (BufferIsLocal(buffer))
|
|
|
|
return LocalBufferBlockPointers[-buffer - 1];
|
|
|
|
else
|
|
|
|
return (Block) (BufferBlocks + ((Size) (buffer - 1)) * BLCKSZ);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BufferGetPageSize
|
|
|
|
* Returns the page size within a buffer.
|
|
|
|
*
|
|
|
|
* Notes:
|
|
|
|
* Assumes buffer is valid.
|
|
|
|
*
|
|
|
|
* The buffer can be a raw disk block and need not contain a valid
|
|
|
|
* (formatted) disk page.
|
|
|
|
*/
|
|
|
|
/* XXX should dig out of buffer descriptor */
|
|
|
|
static inline Size
|
|
|
|
BufferGetPageSize(Buffer buffer)
|
|
|
|
{
|
|
|
|
AssertMacro(BufferIsValid(buffer));
|
|
|
|
return (Size) BLCKSZ;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BufferGetPage
|
|
|
|
* Returns the page associated with a buffer.
|
|
|
|
*/
|
|
|
|
static inline Page
|
|
|
|
BufferGetPage(Buffer buffer)
|
|
|
|
{
|
|
|
|
return (Page) BufferGetBlock(buffer);
|
|
|
|
}
|
|
|
|
|
2016-04-15 16:44:28 +02:00
|
|
|
#endif /* FRONTEND */
|
|
|
|
|
|
|
|
#endif /* BUFMGR_H */
|