2000-10-13 14:06:40 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
2000-10-20 13:01:21 +02:00
|
|
|
* xlogutils.c
|
2000-10-13 14:06:40 +02:00
|
|
|
*
|
2017-05-12 17:49:56 +02:00
|
|
|
* PostgreSQL write-ahead log manager utility routines
|
2004-07-22 00:31:26 +02:00
|
|
|
*
|
|
|
|
* This file contains support routines that are used by XLOG replay functions.
|
|
|
|
* None of this code is used during normal system operation.
|
|
|
|
*
|
2000-10-13 14:06:40 +02:00
|
|
|
*
|
2022-01-08 01:04:57 +01:00
|
|
|
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
|
2000-10-13 14:06:40 +02:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
|
|
*
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/backend/access/transam/xlogutils.c
|
XLOG (and related) changes:
* Store two past checkpoint locations, not just one, in pg_control.
On startup, we fall back to the older checkpoint if the newer one
is unreadable. Also, a physical copy of the newest checkpoint record
is kept in pg_control for possible use in disaster recovery (ie,
complete loss of pg_xlog). Also add a version number for pg_control
itself. Remove archdir from pg_control; it ought to be a GUC
parameter, not a special case (not that it's implemented yet anyway).
* Suppress successive checkpoint records when nothing has been entered
in the WAL log since the last one. This is not so much to avoid I/O
as to make it actually useful to keep track of the last two
checkpoints. If the things are right next to each other then there's
not a lot of redundancy gained...
* Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs
on alternate bytes. Polynomial borrowed from ECMA DLT1 standard.
* Fix XLOG record length handling so that it will work at BLCKSZ = 32k.
* Change XID allocation to work more like OID allocation. (This is of
dubious necessity, but I think it's a good idea anyway.)
* Fix a number of minor bugs, such as off-by-one logic for XLOG file
wraparound at the 4 gig mark.
* Add documentation and clean up some coding infelicities; move file
format declarations out to include files where planned contrib
utilities can get at them.
* Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or
every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also
possible to force a checkpoint by sending SIGUSR1 to the postmaster
(undocumented feature...)
* Defend against kill -9 postmaster by storing shmem block's key and ID
in postmaster.pid lockfile, and checking at startup to ensure that no
processes are still connected to old shmem block (if it still exists).
* Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency
stop, for symmetry with postmaster and xlog utilities. Clean up signal
handling in bootstrap.c so that xlog utilities launched by postmaster
will react to signals better.
* Standalone bootstrap now grabs lockfile in target directory, as added
insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
|
|
|
*
|
2000-10-13 14:06:40 +02:00
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
XLOG (and related) changes:
* Store two past checkpoint locations, not just one, in pg_control.
On startup, we fall back to the older checkpoint if the newer one
is unreadable. Also, a physical copy of the newest checkpoint record
is kept in pg_control for possible use in disaster recovery (ie,
complete loss of pg_xlog). Also add a version number for pg_control
itself. Remove archdir from pg_control; it ought to be a GUC
parameter, not a special case (not that it's implemented yet anyway).
* Suppress successive checkpoint records when nothing has been entered
in the WAL log since the last one. This is not so much to avoid I/O
as to make it actually useful to keep track of the last two
checkpoints. If the things are right next to each other then there's
not a lot of redundancy gained...
* Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs
on alternate bytes. Polynomial borrowed from ECMA DLT1 standard.
* Fix XLOG record length handling so that it will work at BLCKSZ = 32k.
* Change XID allocation to work more like OID allocation. (This is of
dubious necessity, but I think it's a good idea anyway.)
* Fix a number of minor bugs, such as off-by-one logic for XLOG file
wraparound at the 4 gig mark.
* Add documentation and clean up some coding infelicities; move file
format declarations out to include files where planned contrib
utilities can get at them.
* Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or
every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also
possible to force a checkpoint by sending SIGUSR1 to the postmaster
(undocumented feature...)
* Defend against kill -9 postmaster by storing shmem block's key and ID
in postmaster.pid lockfile, and checking at startup to ensure that no
processes are still connected to old shmem block (if it still exists).
* Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency
stop, for symmetry with postmaster and xlog utilities. Clean up signal
handling in bootstrap.c so that xlog utilities launched by postmaster
will react to signals better.
* Standalone bootstrap now grabs lockfile in target directory, as added
insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
|
|
|
|
2016-01-21 02:18:58 +01:00
|
|
|
#include <unistd.h>
|
|
|
|
|
2017-03-22 08:05:12 +01:00
|
|
|
#include "access/timeline.h"
|
2022-02-16 08:30:38 +01:00
|
|
|
#include "access/xlogrecovery.h"
|
2016-01-21 02:18:58 +01:00
|
|
|
#include "access/xlog_internal.h"
|
2022-04-07 09:28:40 +02:00
|
|
|
#include "access/xlogprefetcher.h"
|
2000-10-20 13:01:21 +02:00
|
|
|
#include "access/xlogutils.h"
|
2016-03-30 23:56:13 +02:00
|
|
|
#include "miscadmin.h"
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
#include "pgstat.h"
|
2021-07-31 08:50:26 +02:00
|
|
|
#include "storage/fd.h"
|
2001-08-25 20:52:43 +02:00
|
|
|
#include "storage/smgr.h"
|
2011-09-04 07:13:16 +02:00
|
|
|
#include "utils/guc.h"
|
|
|
|
#include "utils/hsearch.h"
|
2008-06-19 02:46:06 +02:00
|
|
|
#include "utils/rel.h"
|
2000-10-13 14:06:40 +02:00
|
|
|
|
2001-08-25 20:52:43 +02:00
|
|
|
|
2020-01-22 03:56:34 +01:00
|
|
|
/* GUC variable */
|
|
|
|
/*
 * When true, log_invalid_page() reports a reference to an invalid page that
 * is found after recovery has reached consistency at WARNING level rather
 * than raising PANIC.
 */
bool ignore_invalid_pages = false;
|
|
|
|
|
2021-07-31 08:50:26 +02:00
|
|
|
/*
|
|
|
|
* Are we doing recovery from XLOG?
|
|
|
|
*
|
|
|
|
* This is only ever true in the startup process; it should be read as meaning
|
|
|
|
* "this process is replaying WAL records", rather than "the system is in
|
|
|
|
* recovery mode". It should be examined primarily by functions that need
|
|
|
|
* to act differently when called from a WAL redo function (e.g., to skip WAL
|
|
|
|
* logging). To check whether the system is in recovery regardless of which
|
|
|
|
* process you're running in, use RecoveryInProgress() but only after shared
|
|
|
|
* memory startup and lock initialization.
|
|
|
|
*
|
2022-02-16 08:30:38 +01:00
|
|
|
* This is updated from xlog.c and xlogrecovery.c, but lives here because
|
|
|
|
* it's mostly read by WAL redo functions.
|
2021-07-31 08:50:26 +02:00
|
|
|
*/
|
|
|
|
bool InRecovery = false;
|
|
|
|
|
|
|
|
/* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */
|
|
|
|
HotStandbyState standbyState = STANDBY_DISABLED;
|
|
|
|
|
2006-04-14 22:27:24 +02:00
|
|
|
/*
|
|
|
|
* During XLOG replay, we may see XLOG records for incremental updates of
|
|
|
|
* pages that no longer exist, because their relation was later dropped or
|
|
|
|
* truncated. (Note: this is only possible when full_page_writes = OFF,
|
|
|
|
* since when it's ON, the first reference we see to a page should always
|
|
|
|
* be a full-page rewrite not an incremental update.) Rather than simply
|
|
|
|
* ignoring such records, we make a note of the referenced page, and then
|
|
|
|
* complain if we don't actually see a drop or truncate covering the page
|
|
|
|
* later in replay.
|
|
|
|
*/
|
|
|
|
typedef struct xl_invalid_page_key
|
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
RelFileLocator locator; /* the relation */
|
2008-08-11 13:05:11 +02:00
|
|
|
ForkNumber forkno; /* the fork number */
|
2006-04-14 22:27:24 +02:00
|
|
|
BlockNumber blkno; /* the page */
|
|
|
|
} xl_invalid_page_key;
|
|
|
|
|
|
|
|
typedef struct xl_invalid_page
|
|
|
|
{
|
|
|
|
xl_invalid_page_key key; /* hash key ... must be first */
|
|
|
|
bool present; /* page existed but contained zeroes */
|
|
|
|
} xl_invalid_page;
|
|
|
|
|
|
|
|
/*
 * Hash table of xl_invalid_page entries, keyed by xl_invalid_page_key.
 * It stays NULL until log_invalid_page() records its first reference, at
 * which point the table is created.
 */
static HTAB *invalid_page_tab = NULL;
|
|
|
|
|
2022-04-08 09:02:10 +02:00
|
|
|
static int read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr,
|
|
|
|
int reqLen, XLogRecPtr targetRecPtr,
|
|
|
|
char *cur_page, bool wait_for_wal);
|
2022-03-29 15:36:21 +02:00
|
|
|
|
2011-12-02 09:49:54 +01:00
|
|
|
/* Report a reference to an invalid page */
|
|
|
|
static void
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
report_invalid_page(int elevel, RelFileLocator locator, ForkNumber forkno,
|
2011-12-02 09:49:54 +01:00
|
|
|
BlockNumber blkno, bool present)
|
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
char *path = relpathperm(locator, forkno);
|
2011-12-02 09:49:54 +01:00
|
|
|
|
|
|
|
if (present)
|
|
|
|
elog(elevel, "page %u of relation %s is uninitialized",
|
|
|
|
blkno, path);
|
|
|
|
else
|
|
|
|
elog(elevel, "page %u of relation %s does not exist",
|
|
|
|
blkno, path);
|
|
|
|
pfree(path);
|
|
|
|
}
|
|
|
|
|
2006-04-14 22:27:24 +02:00
|
|
|
/* Log a reference to an invalid page */
|
|
|
|
static void
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
log_invalid_page(RelFileLocator locator, ForkNumber forkno, BlockNumber blkno,
|
2008-08-11 13:05:11 +02:00
|
|
|
bool present)
|
2006-04-14 22:27:24 +02:00
|
|
|
{
|
|
|
|
xl_invalid_page_key key;
|
|
|
|
xl_invalid_page *hentry;
|
|
|
|
bool found;
|
|
|
|
|
2011-12-02 09:49:54 +01:00
|
|
|
/*
|
|
|
|
* Once recovery has reached a consistent state, the invalid-page table
|
|
|
|
* should be empty and remain so. If a reference to an invalid page is
|
|
|
|
* found after consistency is reached, PANIC immediately. This might seem
|
|
|
|
* aggressive, but it's better than letting the invalid reference linger
|
|
|
|
* in the hash table until the end of recovery and PANIC there, which
|
|
|
|
* might come only much later if this is a standby server.
|
|
|
|
*/
|
2011-12-09 13:32:42 +01:00
|
|
|
if (reachedConsistency)
|
2011-12-02 09:49:54 +01:00
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
report_invalid_page(WARNING, locator, forkno, blkno, present);
|
2020-01-22 03:56:34 +01:00
|
|
|
elog(ignore_invalid_pages ? WARNING : PANIC,
|
|
|
|
"WAL contains references to invalid pages");
|
2011-12-02 09:49:54 +01:00
|
|
|
}
|
|
|
|
|
2006-04-14 22:27:24 +02:00
|
|
|
/*
|
|
|
|
* Log references to invalid pages at DEBUG1 level. This allows some
|
|
|
|
* tracing of the cause (note the elog context mechanism will tell us
|
|
|
|
* something about the XLOG record that generated the reference).
|
|
|
|
*/
|
2020-11-24 01:04:07 +01:00
|
|
|
if (message_level_is_interesting(DEBUG1))
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
report_invalid_page(DEBUG1, locator, forkno, blkno, present);
|
2006-04-14 22:27:24 +02:00
|
|
|
|
|
|
|
if (invalid_page_tab == NULL)
|
|
|
|
{
|
|
|
|
/* create hash table when first needed */
|
|
|
|
HASHCTL ctl;
|
|
|
|
|
|
|
|
ctl.keysize = sizeof(xl_invalid_page_key);
|
|
|
|
ctl.entrysize = sizeof(xl_invalid_page);
|
|
|
|
|
|
|
|
invalid_page_tab = hash_create("XLOG invalid-page table",
|
|
|
|
100,
|
|
|
|
&ctl,
|
Improve hash_create's API for selecting simple-binary-key hash functions.
Previously, if you wanted anything besides C-string hash keys, you had to
specify a custom hashing function to hash_create(). Nearly all such
callers were specifying tag_hash or oid_hash; which is tedious, and rather
error-prone, since a caller could easily miss the opportunity to optimize
by using hash_uint32 when appropriate. Replace this with a design whereby
callers using simple binary-data keys just specify HASH_BLOBS and don't
need to mess with specific support functions. hash_create() itself will
take care of optimizing when the key size is four bytes.
This nets out saving a few hundred bytes of code space, and offers
a measurable performance improvement in tidbitmap.c (which was not
exploiting the opportunity to use hash_uint32 for its 4-byte keys).
There might be some wins elsewhere too, I didn't analyze closely.
In future we could look into offering a similar optimized hashing function
for 8-byte keys. Under this design that could be done in a centralized
and machine-independent fashion, whereas getting it right for keys of
platform-dependent sizes would've been notationally painful before.
For the moment, the old way still works fine, so as not to break source
code compatibility for loadable modules. Eventually we might want to
remove tag_hash and friends from the exported API altogether, since there's
no real need for them to be explicitly referenced from outside dynahash.c.
Teodor Sigaev and Tom Lane
2014-12-18 19:36:29 +01:00
|
|
|
HASH_ELEM | HASH_BLOBS);
|
2006-04-14 22:27:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* we currently assume xl_invalid_page_key contains no padding */
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
key.locator = locator;
|
2008-08-11 13:05:11 +02:00
|
|
|
key.forkno = forkno;
|
2006-04-14 22:27:24 +02:00
|
|
|
key.blkno = blkno;
|
|
|
|
hentry = (xl_invalid_page *)
|
|
|
|
hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
|
|
|
|
|
|
|
|
if (!found)
|
|
|
|
{
|
|
|
|
/* hash_search already filled in the key */
|
|
|
|
hentry->present = present;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* repeat reference ... leave "present" as it was */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Forget any invalid pages >= minblkno, because they've been dropped */
|
|
|
|
static void
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
forget_invalid_pages(RelFileLocator locator, ForkNumber forkno,
|
|
|
|
BlockNumber minblkno)
|
2006-04-14 22:27:24 +02:00
|
|
|
{
|
|
|
|
HASH_SEQ_STATUS status;
|
|
|
|
xl_invalid_page *hentry;
|
|
|
|
|
|
|
|
if (invalid_page_tab == NULL)
|
|
|
|
return; /* nothing to do */
|
|
|
|
|
|
|
|
hash_seq_init(&status, invalid_page_tab);
|
|
|
|
|
|
|
|
while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
|
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
if (RelFileLocatorEquals(hentry->key.locator, locator) &&
|
2008-08-11 13:05:11 +02:00
|
|
|
hentry->key.forkno == forkno &&
|
2006-04-14 22:27:24 +02:00
|
|
|
hentry->key.blkno >= minblkno)
|
|
|
|
{
|
2020-11-24 01:04:07 +01:00
|
|
|
if (message_level_is_interesting(DEBUG2))
|
2008-11-11 14:19:16 +01:00
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
char *path = relpathperm(hentry->key.locator, forkno);
|
2009-06-11 16:49:15 +02:00
|
|
|
|
2008-11-11 14:19:16 +01:00
|
|
|
elog(DEBUG2, "page %u of relation %s has been dropped",
|
|
|
|
hentry->key.blkno, path);
|
|
|
|
pfree(path);
|
|
|
|
}
|
2006-04-14 22:27:24 +02:00
|
|
|
|
|
|
|
if (hash_search(invalid_page_tab,
|
|
|
|
(void *) &hentry->key,
|
|
|
|
HASH_REMOVE, NULL) == NULL)
|
|
|
|
elog(ERROR, "hash table corrupted");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Forget any invalid pages in a whole database */
|
|
|
|
static void
|
|
|
|
forget_invalid_pages_db(Oid dbid)
|
|
|
|
{
|
|
|
|
HASH_SEQ_STATUS status;
|
|
|
|
xl_invalid_page *hentry;
|
|
|
|
|
|
|
|
if (invalid_page_tab == NULL)
|
|
|
|
return; /* nothing to do */
|
|
|
|
|
|
|
|
hash_seq_init(&status, invalid_page_tab);
|
|
|
|
|
|
|
|
while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
|
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
if (hentry->key.locator.dbOid == dbid)
|
2006-04-14 22:27:24 +02:00
|
|
|
{
|
2020-11-24 01:04:07 +01:00
|
|
|
if (message_level_is_interesting(DEBUG2))
|
2008-11-11 14:19:16 +01:00
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
char *path = relpathperm(hentry->key.locator, hentry->key.forkno);
|
2009-06-11 16:49:15 +02:00
|
|
|
|
2008-11-11 14:19:16 +01:00
|
|
|
elog(DEBUG2, "page %u of relation %s has been dropped",
|
|
|
|
hentry->key.blkno, path);
|
|
|
|
pfree(path);
|
|
|
|
}
|
2006-04-14 22:27:24 +02:00
|
|
|
|
|
|
|
if (hash_search(invalid_page_tab,
|
|
|
|
(void *) &hentry->key,
|
|
|
|
HASH_REMOVE, NULL) == NULL)
|
|
|
|
elog(ERROR, "hash table corrupted");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-12-02 09:49:54 +01:00
|
|
|
/* Are there any unresolved references to invalid pages? */
|
|
|
|
bool
|
|
|
|
XLogHaveInvalidPages(void)
|
|
|
|
{
|
|
|
|
if (invalid_page_tab != NULL &&
|
|
|
|
hash_get_num_entries(invalid_page_tab) > 0)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2006-04-14 22:27:24 +02:00
|
|
|
/* Complain about any remaining invalid-page entries */
|
|
|
|
void
|
|
|
|
XLogCheckInvalidPages(void)
|
|
|
|
{
|
|
|
|
HASH_SEQ_STATUS status;
|
|
|
|
xl_invalid_page *hentry;
|
|
|
|
bool foundone = false;
|
|
|
|
|
|
|
|
if (invalid_page_tab == NULL)
|
|
|
|
return; /* nothing to do */
|
|
|
|
|
|
|
|
hash_seq_init(&status, invalid_page_tab);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Our strategy is to emit WARNING messages for all remaining entries and
|
|
|
|
* only PANIC after we've dumped all the available info.
|
|
|
|
*/
|
|
|
|
while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
|
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
report_invalid_page(WARNING, hentry->key.locator, hentry->key.forkno,
|
2011-12-02 09:49:54 +01:00
|
|
|
hentry->key.blkno, hentry->present);
|
2006-04-14 22:27:24 +02:00
|
|
|
foundone = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (foundone)
|
2020-01-22 03:56:34 +01:00
|
|
|
elog(ignore_invalid_pages ? WARNING : PANIC,
|
|
|
|
"WAL contains references to invalid pages");
|
2008-06-12 11:12:31 +02:00
|
|
|
|
|
|
|
hash_destroy(invalid_page_tab);
|
|
|
|
invalid_page_tab = NULL;
|
2006-04-14 22:27:24 +02:00
|
|
|
}
|
|
|
|
|
2014-08-13 14:39:08 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* XLogReadBufferForRedo
|
|
|
|
* Read a page during XLOG replay
|
|
|
|
*
|
|
|
|
* Reads a block referenced by a WAL record into shared buffer cache, and
|
|
|
|
* determines what needs to be done to redo the changes to it. If the WAL
|
|
|
|
* record includes a full-page image of the page, it is restored.
|
|
|
|
*
|
2020-08-14 20:09:08 +02:00
|
|
|
* 'record.EndRecPtr' is compared to the page's LSN to determine if the record
|
|
|
|
* has already been replayed. 'block_id' is the ID number the block was
|
|
|
|
* registered with, when the WAL record was created.
|
2014-08-13 14:39:08 +02:00
|
|
|
*
|
|
|
|
* Returns one of the following:
|
|
|
|
*
|
|
|
|
* BLK_NEEDS_REDO - changes from the WAL record need to be applied
|
|
|
|
* BLK_DONE - block doesn't need replaying
|
|
|
|
* BLK_RESTORED - block was restored from a full-page image included in
|
|
|
|
* the record
|
|
|
|
* BLK_NOTFOUND - block was not found (because it was truncated away by
|
|
|
|
* an operation later in the WAL stream)
|
|
|
|
*
|
|
|
|
* On return, the buffer is locked in exclusive-mode, and returned in *buf.
|
|
|
|
* Note that the buffer is locked and returned even if it doesn't need
|
|
|
|
* replaying. (Getting the buffer lock is not really necessary during
|
|
|
|
* single-process crash recovery, but some subroutines such as MarkBufferDirty
|
|
|
|
* will complain if we don't have the lock. In hot standby mode it's
|
|
|
|
* definitely necessary.)
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
*
|
2017-02-08 21:45:30 +01:00
|
|
|
* Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag
|
|
|
|
* set, we restore it, even if the page in the database appears newer. This
|
|
|
|
* is to protect ourselves against database pages that were partially or
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
* incorrectly written during a crash. We assume that the XLOG data must be
|
|
|
|
* good because it has passed a CRC check, while the database page might not
|
|
|
|
* be. This will force us to replay all subsequent modifications of the page
|
|
|
|
* that appear in XLOG, rather than possibly ignoring them as already
|
|
|
|
* applied, but that's not a huge drawback.
|
2014-08-13 14:39:08 +02:00
|
|
|
*/
|
|
|
|
XLogRedoAction
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id,
|
2014-08-13 14:39:08 +02:00
|
|
|
Buffer *buf)
|
|
|
|
{
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL,
|
|
|
|
false, buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pin and lock a buffer referenced by a WAL record, for the purpose of
|
|
|
|
* re-initializing it.
|
|
|
|
*/
|
|
|
|
Buffer
|
|
|
|
XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id)
|
|
|
|
{
|
|
|
|
Buffer buf;
|
|
|
|
|
|
|
|
XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false,
|
|
|
|
&buf);
|
|
|
|
return buf;
|
2014-08-13 14:39:08 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XLogReadBufferForRedoExtended
|
|
|
|
* Like XLogReadBufferForRedo, but with extra options.
|
|
|
|
*
|
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out,
but not yet locked. If a backend pinned and locked the page in that window,
it saw the zeroed page instead of the old page or new page contents, which
could lead to missing rows in a result set, or errors.
To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins,
zeroes, and locks the page, if it's not in the buffer cache already.
In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE,
to avoid breaking any 3rd party extensions that might use RBM_ZERO. More
importantly, this avoids renumbering the other enum values, which would
cause even bigger confusion in extensions that use ReadBufferExtended, but
haven't been recompiled.
Backpatch to all supported versions; this has been racy since hot standby
was introduced.
2014-11-13 18:47:44 +01:00
|
|
|
* In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
|
|
|
|
* with all-zeroes pages up to the referenced block number. In
|
|
|
|
* RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value
|
|
|
|
* is always BLK_NEEDS_REDO.
|
|
|
|
*
|
|
|
|
* (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock
|
|
|
|
* parameter. Do not use an inconsistent combination!)
|
2014-08-13 14:39:08 +02:00
|
|
|
*
|
|
|
|
* If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer
|
|
|
|
* using LockBufferForCleanup(), instead of a regular exclusive lock.
|
|
|
|
*/
|
|
|
|
XLogRedoAction
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogReadBufferForRedoExtended(XLogReaderState *record,
|
|
|
|
uint8 block_id,
|
2014-08-13 14:39:08 +02:00
|
|
|
ReadBufferMode mode, bool get_cleanup_lock,
|
|
|
|
Buffer *buf)
|
|
|
|
{
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogRecPtr lsn = record->EndRecPtr;
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
RelFileLocator rlocator;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
ForkNumber forknum;
|
|
|
|
BlockNumber blkno;
|
2022-04-07 09:28:40 +02:00
|
|
|
Buffer prefetch_buffer;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
Page page;
|
2015-07-20 15:02:28 +02:00
|
|
|
bool zeromode;
|
|
|
|
bool willinit;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
if (!XLogRecGetBlockTagExtended(record, block_id, &rlocator, &forknum, &blkno,
|
2022-04-07 09:28:40 +02:00
|
|
|
&prefetch_buffer))
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
{
|
|
|
|
/* Caller specified a bogus block_id */
|
2022-04-11 23:43:46 +02:00
|
|
|
elog(PANIC, "failed to locate backup block with ID %d in WAL record",
|
|
|
|
block_id);
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
}
|
|
|
|
|
2015-07-20 15:02:28 +02:00
|
|
|
/*
|
|
|
|
* Make sure that if the block is marked with WILL_INIT, the caller is
|
|
|
|
* going to initialize it. And vice versa.
|
|
|
|
*/
|
|
|
|
zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
|
2022-03-18 05:45:04 +01:00
|
|
|
willinit = (XLogRecGetBlock(record, block_id)->flags & BKPBLOCK_WILL_INIT) != 0;
|
2015-07-20 15:02:28 +02:00
|
|
|
if (willinit && !zeromode)
|
|
|
|
elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine");
|
|
|
|
if (!willinit && zeromode)
|
|
|
|
elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");
|
|
|
|
|
2017-02-08 21:45:30 +01:00
|
|
|
/* If it has a full-page image and it should be restored, do it. */
|
|
|
|
if (XLogRecBlockImageApply(record, block_id))
|
2014-08-13 14:39:08 +02:00
|
|
|
{
|
2017-02-08 21:45:30 +01:00
|
|
|
Assert(XLogRecHasBlockImage(record, block_id));
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
*buf = XLogReadBufferExtended(rlocator, forknum, blkno,
|
2022-04-07 09:28:40 +02:00
|
|
|
get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK,
|
|
|
|
prefetch_buffer);
|
2016-04-20 15:31:19 +02:00
|
|
|
page = BufferGetPage(*buf);
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
if (!RestoreBlockImage(record, block_id, page))
|
|
|
|
elog(ERROR, "failed to restore block image");
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The page may be uninitialized. If so, we can't set the LSN because
|
|
|
|
* that would corrupt the page.
|
|
|
|
*/
|
|
|
|
if (!PageIsNew(page))
|
|
|
|
{
|
|
|
|
PageSetLSN(page, lsn);
|
|
|
|
}
|
|
|
|
|
|
|
|
MarkBufferDirty(*buf);
|
|
|
|
|
2015-12-10 16:25:12 +01:00
|
|
|
/*
|
|
|
|
* At the end of crash recovery the init forks of unlogged relations
|
|
|
|
* are copied, without going through shared buffers. So we need to
|
|
|
|
* force the on-disk state of init forks to always be in sync with the
|
|
|
|
* state in shared buffers.
|
|
|
|
*/
|
|
|
|
if (forknum == INIT_FORKNUM)
|
|
|
|
FlushOneBuffer(*buf);
|
|
|
|
|
2014-08-13 14:39:08 +02:00
|
|
|
return BLK_RESTORED;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
*buf = XLogReadBufferExtended(rlocator, forknum, blkno, mode, prefetch_buffer);
|
2014-08-13 14:39:08 +02:00
|
|
|
if (BufferIsValid(*buf))
|
|
|
|
{
|
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out,
but not yet locked. If a backend pinned and locked the page in that window,
it saw the zeroed page instead of the old page or new page contents, which
could lead to missing rows in a result set, or errors.
To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins,
zeroes, and locks the page, if it's not in the buffer cache already.
In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE,
to avoid breaking any 3rd party extensions that might use RBM_ZERO. More
importantly, this avoids renumbering the other enum values, which would
cause even bigger confusion in extensions that use ReadBufferExtended, but
haven't been recompiled.
Backpatch to all supported versions; this has been racy since hot standby
was introduced.
2014-11-13 18:47:44 +01:00
|
|
|
if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
|
|
|
|
{
|
|
|
|
if (get_cleanup_lock)
|
|
|
|
LockBufferForCleanup(*buf);
|
|
|
|
else
|
|
|
|
LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
|
|
|
|
}
|
2016-04-20 15:31:19 +02:00
|
|
|
if (lsn <= PageGetLSN(BufferGetPage(*buf)))
|
2014-08-13 14:39:08 +02:00
|
|
|
return BLK_DONE;
|
|
|
|
else
|
|
|
|
return BLK_NEEDS_REDO;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return BLK_NOTFOUND;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-10-20 13:01:21 +02:00
|
|
|
/*
|
2008-11-03 16:10:17 +01:00
|
|
|
* XLogReadBufferExtended
|
2006-03-29 23:17:39 +02:00
|
|
|
* Read a page during XLOG replay
|
2000-10-20 13:01:21 +02:00
|
|
|
*
|
2009-01-20 19:59:37 +01:00
|
|
|
* This is functionally comparable to ReadBufferExtended. There are some
|
|
|
|
* differences in the behavior wrt. the "mode" argument:
|
2006-03-29 23:17:39 +02:00
|
|
|
*
|
Unite ReadBufferWithFork, ReadBufferWithStrategy, and ZeroOrReadBuffer
functions into one ReadBufferExtended function, that takes the strategy
and mode as argument. There are three modes, RBM_NORMAL which is the default
used by plain ReadBuffer(), RBM_ZERO, which replaces ZeroOrReadBuffer, and
a new mode RBM_ZERO_ON_ERROR, which allows callers to read corrupt pages
without throwing an error. The FSM needs the new mode to recover from
corrupt pages, which could happen if we crash after extending an FSM file,
and the new page is "torn".
Add fork number to some error messages in bufmgr.c, that still lacked it.
2008-10-31 16:05:00 +01:00
|
|
|
* In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
|
|
|
|
* return InvalidBuffer. In this case the caller should silently skip the
|
|
|
|
* update on this page. (In this situation, we expect that the page was later
|
|
|
|
* dropped or truncated. If we don't see evidence of that later in the WAL
|
|
|
|
* sequence, we'll complain at the end of WAL replay.)
|
|
|
|
*
|
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out,
but not yet locked. If a backend pinned and locked the page in that window,
it saw the zeroed page instead of the old page or new page contents, which
could lead to missing rows in a result set, or errors.
To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins,
zeroes, and locks the page, if it's not in the buffer cache already.
In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE,
to avoid breaking any 3rd party extensions that might use RBM_ZERO. More
importantly, this avoids renumbering the other enum values, which would
cause even bigger confusion in extensions that use ReadBufferExtended, but
haven't been recompiled.
Backpatch to all supported versions; this has been racy since hot standby
was introduced.
2014-11-13 18:47:44 +01:00
|
|
|
* In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
|
|
|
|
* with all-zeroes pages up to the given block number.
|
Fix multiple bugs in index page locking during hot-standby WAL replay.
In ordinary operation, VACUUM must be careful to take a cleanup lock on
each leaf page of a btree index; this ensures that no indexscans could
still be "in flight" to heap tuples due to be deleted. (Because of
possible index-tuple motion due to concurrent page splits, it's not enough
to lock only the pages we're deleting index tuples from.) In Hot Standby,
the WAL replay process must likewise lock every leaf page. There were
several bugs in the code for that:
* The replay scan might come across unused, all-zero pages in the index.
While btree_xlog_vacuum itself did the right thing (ie, nothing) with
such pages, xlogutils.c supposed that such pages must be corrupt and
would throw an error. This accounts for various reports of replication
failures with "PANIC: WAL contains references to invalid pages". To
fix, add a ReadBufferMode value that instructs XLogReadBufferExtended
not to complain when we're doing this.
* btree_xlog_vacuum performed the extra locking if standbyState ==
STANDBY_SNAPSHOT_READY, but that's not the correct test: we won't open up
for hot standby queries until the database has reached consistency, and
we don't want to do the extra locking till then either, for fear of reading
corrupted pages (which bufmgr.c would complain about). Fix by exporting a
new function from xlog.c that will report whether we're actually in hot
standby replay mode.
* To ensure full coverage of the index in the replay scan, btvacuumscan
would emit a dummy WAL record for the last page of the index, if no
vacuuming work had been done on that page. However, if the last page
of the index is all-zero, that would result in corruption of said page,
since the functions called on it weren't prepared to handle that case.
There's no need to lock any such pages, so change the logic to target
the last normal leaf page instead.
The first two of these bugs were diagnosed by Andres Freund, the other one
by me. Fixes based on ideas from Heikki Linnakangas and myself.
This has been wrong since Hot Standby was introduced, so back-patch to 9.0.
2014-01-14 23:34:47 +01:00
|
|
|
*
|
|
|
|
* In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
|
|
|
|
* exist, and we don't check for all-zeroes. Thus, no log entry is made
|
|
|
|
* to imply that the page should be dropped or truncated later.
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
*
|
2022-04-07 09:28:40 +02:00
|
|
|
* Optionally, recent_buffer can be used to provide a hint about the location
|
|
|
|
* of the page in the buffer pool; it does not have to be correct, but avoids
|
|
|
|
* a buffer mapping table probe if it is.
|
|
|
|
*
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
* NB: A redo function should normally not call this directly. To get a page
|
2016-05-12 21:02:49 +02:00
|
|
|
* to modify, use XLogReadBufferForRedoExtended instead. It is important that
|
|
|
|
* all pages modified by a WAL record are registered in the WAL records, or
|
2021-02-24 08:13:17 +01:00
|
|
|
* they will be invisible to tools that need to know which pages are modified.
|
2008-08-11 13:05:11 +02:00
|
|
|
*/
|
|
|
|
Buffer
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum,
|
2022-04-07 09:28:40 +02:00
|
|
|
BlockNumber blkno, ReadBufferMode mode,
|
|
|
|
Buffer recent_buffer)
|
2000-10-20 13:01:21 +02:00
|
|
|
{
|
2008-06-12 11:12:31 +02:00
|
|
|
BlockNumber lastblock;
|
2000-10-20 13:01:21 +02:00
|
|
|
Buffer buffer;
|
2008-06-12 11:12:31 +02:00
|
|
|
SMgrRelation smgr;
|
2000-10-20 13:01:21 +02:00
|
|
|
|
2006-03-29 23:17:39 +02:00
|
|
|
Assert(blkno != P_NEW);
|
|
|
|
|
2022-04-07 09:28:40 +02:00
|
|
|
/* Do we have a clue where the buffer might be already? */
|
|
|
|
if (BufferIsValid(recent_buffer) &&
|
|
|
|
mode == RBM_NORMAL &&
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
ReadRecentBuffer(rlocator, forknum, blkno, recent_buffer))
|
2022-04-07 09:28:40 +02:00
|
|
|
{
|
|
|
|
buffer = recent_buffer;
|
|
|
|
goto recent_buffer_fast_path;
|
|
|
|
}
|
|
|
|
|
2008-06-12 11:12:31 +02:00
|
|
|
/* Open the relation at smgr level */
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
smgr = smgropen(rlocator, InvalidBackendId);
|
2008-06-12 11:12:31 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Create the target file if it doesn't already exist. This lets us cope
|
|
|
|
* if the replay sequence contains writes to a relation that is later
|
|
|
|
* deleted. (The original coding of this routine would instead suppress
|
|
|
|
* the writes, but that seems like it risks losing valuable data if the
|
|
|
|
* filesystem loses an inode during a crash. Better to write the data
|
|
|
|
* until we are actually told to delete the file.)
|
|
|
|
*/
|
2008-11-19 11:34:52 +01:00
|
|
|
smgrcreate(smgr, forknum, true);
|
2008-06-12 11:12:31 +02:00
|
|
|
|
2008-08-11 13:05:11 +02:00
|
|
|
lastblock = smgrnblocks(smgr, forknum);
|
2008-06-12 11:12:31 +02:00
|
|
|
|
2006-03-29 23:17:39 +02:00
|
|
|
if (blkno < lastblock)
|
2000-10-20 13:01:21 +02:00
|
|
|
{
|
2006-03-29 23:17:39 +02:00
|
|
|
/* page exists in file */
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
buffer = ReadBufferWithoutRelcache(rlocator, forknum, blkno,
|
Add new block-by-block strategy for CREATE DATABASE.
Because this strategy logs changes on a block-by-block basis, it
avoids the need to checkpoint before and after the operation.
However, because it logs each changed block individually, it might
generate a lot of extra write-ahead logging if the template database
is large. Therefore, the older strategy remains available via a new
STRATEGY parameter to CREATE DATABASE, and a corresponding --strategy
option to createdb.
Somewhat controversially, this patch assembles the list of relations
to be copied to the new database by reading the pg_class relation of
the template database. Cross-database access like this isn't normally
possible, but it can be made to work here because there can't be any
connections to the database being copied, nor can it contain any
in-doubt transactions. Even so, we have to use lower-level interfaces
than normal, since the table scan and relcache interfaces will not
work for a database to which we're not connected. The advantage of
this approach is that we do not need to rely on the filesystem to
determine what ought to be copied, but instead on PostgreSQL's own
knowledge of the database structure. This avoids, for example,
copying stray files that happen to be located in the source database
directory.
Dilip Kumar, with a fairly large number of cosmetic changes by me.
Reviewed and tested by Ashutosh Sharma, Andres Freund, John Naylor,
Greg Nancarrow, Neha Sharma. Additional feedback from Bruce Momjian,
Heikki Linnakangas, Julien Rouhaud, Adam Brusselback, Kyotaro
Horiguchi, Tomas Vondra, Andrew Dunstan, Álvaro Herrera, and others.
Discussion: http://postgr.es/m/CA+TgmoYtcdxBjLh31DLxUXHxFVMPGzrU5_T=CYCvRyFHywSBUQ@mail.gmail.com
2022-03-29 17:31:43 +02:00
|
|
|
mode, NULL, true);
|
2006-03-29 23:17:39 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* hm, page doesn't exist in file */
|
Unite ReadBufferWithFork, ReadBufferWithStrategy, and ZeroOrReadBuffer
functions into one ReadBufferExtended function, that takes the strategy
and mode as arguments. There are three modes: RBM_NORMAL, which is the default
used by plain ReadBuffer(), RBM_ZERO, which replaces ZeroOrReadBuffer, and
a new mode RBM_ZERO_ON_ERROR, which allows callers to read corrupt pages
without throwing an error. The FSM needs the new mode to recover from
corrupt pages, which could happen if we crash after extending an FSM file,
and the new page is "torn".
Add fork number to some error messages in bufmgr.c, that still lacked it.
2008-10-31 16:05:00 +01:00
|
|
|
if (mode == RBM_NORMAL)
|
2006-04-14 22:27:24 +02:00
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
log_invalid_page(rlocator, forknum, blkno, false);
|
2006-04-14 22:27:24 +02:00
|
|
|
return InvalidBuffer;
|
|
|
|
}
|
Fix multiple bugs in index page locking during hot-standby WAL replay.
In ordinary operation, VACUUM must be careful to take a cleanup lock on
each leaf page of a btree index; this ensures that no indexscans could
still be "in flight" to heap tuples due to be deleted. (Because of
possible index-tuple motion due to concurrent page splits, it's not enough
to lock only the pages we're deleting index tuples from.) In Hot Standby,
the WAL replay process must likewise lock every leaf page. There were
several bugs in the code for that:
* The replay scan might come across unused, all-zero pages in the index.
While btree_xlog_vacuum itself did the right thing (ie, nothing) with
such pages, xlogutils.c supposed that such pages must be corrupt and
would throw an error. This accounts for various reports of replication
failures with "PANIC: WAL contains references to invalid pages". To
fix, add a ReadBufferMode value that instructs XLogReadBufferExtended
not to complain when we're doing this.
* btree_xlog_vacuum performed the extra locking if standbyState ==
STANDBY_SNAPSHOT_READY, but that's not the correct test: we won't open up
for hot standby queries until the database has reached consistency, and
we don't want to do the extra locking till then either, for fear of reading
corrupted pages (which bufmgr.c would complain about). Fix by exporting a
new function from xlog.c that will report whether we're actually in hot
standby replay mode.
* To ensure full coverage of the index in the replay scan, btvacuumscan
would emit a dummy WAL record for the last page of the index, if no
vacuuming work had been done on that page. However, if the last page
of the index is all-zero, that would result in corruption of said page,
since the functions called on it weren't prepared to handle that case.
There's no need to lock any such pages, so change the logic to target
the last normal leaf page instead.
The first two of these bugs were diagnosed by Andres Freund, the other one
by me. Fixes based on ideas from Heikki Linnakangas and myself.
This has been wrong since Hot Standby was introduced, so back-patch to 9.0.
2014-01-14 23:34:47 +01:00
|
|
|
if (mode == RBM_NORMAL_NO_LOG)
|
|
|
|
return InvalidBuffer;
|
2006-03-29 23:17:39 +02:00
|
|
|
/* OK to extend the file */
|
|
|
|
/* we do this in recovery only - no rel-extension lock needed */
|
|
|
|
Assert(InRecovery);
|
2000-10-20 13:01:21 +02:00
|
|
|
buffer = InvalidBuffer;
|
In XLogReadBufferExtended, don't assume P_NEW yields consecutive pages.
In a database that's not yet reached consistency, it's possible that some
segments of a relation are not full-size but are not the last ones either.
Because of the way smgrnblocks() works, asking for a new page with P_NEW
will fill in the last not-full-size segment --- and if that makes it full
size, the apparent EOF of the relation will increase by more than one page,
so that the next P_NEW request will yield a page past the next consecutive
one. This breaks the relation-extension logic in XLogReadBufferExtended,
possibly allowing a page update to be applied to some page far past where
it was intended to go. This appears to be the explanation for reports of
table bloat on replication slaves compared to their masters, and probably
explains some corrupted-slave reports as well.
Fix the loop to check the page number it actually got, rather than merely
Assert()'ing that dead reckoning got it to the desired place. AFAICT,
there are no other places that make assumptions about exactly which page
they'll get from P_NEW.
Problem identified by Greg Stark, though this is not the same as his
proposed patch.
It's been like this for a long time, so back-patch to all supported
branches.
2014-02-12 20:52:16 +01:00
|
|
|
do
|
2000-10-20 13:01:21 +02:00
|
|
|
{
|
2006-03-29 23:17:39 +02:00
|
|
|
if (buffer != InvalidBuffer)
|
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out,
but not yet locked. If a backend pinned and locked the page in that window,
it saw the zeroed page instead of the old page or new page contents, which
could lead to missing rows in a result set, or errors.
To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins,
zeroes, and locks the page, if it's not in the buffer cache already.
In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE,
to avoid breaking any 3rd party extensions that might use RBM_ZERO. More
importantly, this avoids renumbering the other enum values, which would
cause even bigger confusion in extensions that use ReadBufferExtended, but
haven't been recompiled.
Backpatch to all supported versions; this has been racy since hot standby
was introduced.
2014-11-13 18:47:44 +01:00
|
|
|
{
|
|
|
|
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
|
|
|
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
2006-04-01 01:32:07 +02:00
|
|
|
ReleaseBuffer(buffer);
|
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out,
but not yet locked. If a backend pinned and locked the page in that window,
it saw the zeroed page instead of the old page or new page contents, which
could lead to missing rows in a result set, or errors.
To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins,
zeroes, and locks the page, if it's not in the buffer cache already.
In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE,
to avoid breaking any 3rd party extensions that might use RBM_ZERO. More
importantly, this avoids renumbering the other enum values, which would
cause even bigger confusion in extensions that use ReadBufferExtended, but
haven't been recompiled.
Backpatch to all supported versions; this has been racy since hot standby
was introduced.
2014-11-13 18:47:44 +01:00
|
|
|
}
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
buffer = ReadBufferWithoutRelcache(rlocator, forknum,
|
Add new block-by-block strategy for CREATE DATABASE.
Because this strategy logs changes on a block-by-block basis, it
avoids the need to checkpoint before and after the operation.
However, because it logs each changed block individually, it might
generate a lot of extra write-ahead logging if the template database
is large. Therefore, the older strategy remains available via a new
STRATEGY parameter to CREATE DATABASE, and a corresponding --strategy
option to createdb.
Somewhat controversially, this patch assembles the list of relations
to be copied to the new database by reading the pg_class relation of
the template database. Cross-database access like this isn't normally
possible, but it can be made to work here because there can't be any
connections to the database being copied, nor can it contain any
in-doubt transactions. Even so, we have to use lower-level interfaces
than normal, since the table scan and relcache interfaces will not
work for a database to which we're not connected. The advantage of
this approach is that we do not need to rely on the filesystem to
determine what ought to be copied, but instead on PostgreSQL's own
knowledge of the database structure. This avoids, for example,
copying stray files that happen to be located in the source database
directory.
Dilip Kumar, with a fairly large number of cosmetic changes by me.
Reviewed and tested by Ashutosh Sharma, Andres Freund, John Naylor,
Greg Nancarrow, Neha Sharma. Additional feedback from Bruce Momjian,
Heikki Linnakangas, Julien Rouhaud, Adam Brusselback, Kyotaro
Horiguchi, Tomas Vondra, Andrew Dunstan, Álvaro Herrera, and others.
Discussion: http://postgr.es/m/CA+TgmoYtcdxBjLh31DLxUXHxFVMPGzrU5_T=CYCvRyFHywSBUQ@mail.gmail.com
2022-03-29 17:31:43 +02:00
|
|
|
P_NEW, mode, NULL, true);
|
2000-10-20 13:01:21 +02:00
|
|
|
}
|
In XLogReadBufferExtended, don't assume P_NEW yields consecutive pages.
In a database that's not yet reached consistency, it's possible that some
segments of a relation are not full-size but are not the last ones either.
Because of the way smgrnblocks() works, asking for a new page with P_NEW
will fill in the last not-full-size segment --- and if that makes it full
size, the apparent EOF of the relation will increase by more than one page,
so that the next P_NEW request will yield a page past the next consecutive
one. This breaks the relation-extension logic in XLogReadBufferExtended,
possibly allowing a page update to be applied to some page far past where
it was intended to go. This appears to be the explanation for reports of
table bloat on replication slaves compared to their masters, and probably
explains some corrupted-slave reports as well.
Fix the loop to check the page number it actually got, rather than merely
Assert()'ing that dead reckoning got it to the desired place. AFAICT,
there are no other places that make assumptions about exactly which page
they'll get from P_NEW.
Problem identified by Greg Stark, though this is not the same as his
proposed patch.
It's been like this for a long time, so back-patch to all supported
branches.
2014-02-12 20:52:16 +01:00
|
|
|
while (BufferGetBlockNumber(buffer) < blkno);
|
|
|
|
/* Handle the corner case that P_NEW returns non-consecutive pages */
|
|
|
|
if (BufferGetBlockNumber(buffer) != blkno)
|
|
|
|
{
|
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out,
but not yet locked. If a backend pinned and locked the page in that window,
it saw the zeroed page instead of the old page or new page contents, which
could lead to missing rows in a result set, or errors.
To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins,
zeroes, and locks the page, if it's not in the buffer cache already.
In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE,
to avoid breaking any 3rd party extensions that might use RBM_ZERO. More
importantly, this avoids renumbering the other enum values, which would
cause even bigger confusion in extensions that use ReadBufferExtended, but
haven't been recompiled.
Backpatch to all supported versions; this has been racy since hot standby
was introduced.
2014-11-13 18:47:44 +01:00
|
|
|
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
|
|
|
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
In XLogReadBufferExtended, don't assume P_NEW yields consecutive pages.
In a database that's not yet reached consistency, it's possible that some
segments of a relation are not full-size but are not the last ones either.
Because of the way smgrnblocks() works, asking for a new page with P_NEW
will fill in the last not-full-size segment --- and if that makes it full
size, the apparent EOF of the relation will increase by more than one page,
so that the next P_NEW request will yield a page past the next consecutive
one. This breaks the relation-extension logic in XLogReadBufferExtended,
possibly allowing a page update to be applied to some page far past where
it was intended to go. This appears to be the explanation for reports of
table bloat on replication slaves compared to their masters, and probably
explains some corrupted-slave reports as well.
Fix the loop to check the page number it actually got, rather than merely
Assert()'ing that dead reckoning got it to the desired place. AFAICT,
there are no other places that make assumptions about exactly which page
they'll get from P_NEW.
Problem identified by Greg Stark, though this is not the same as his
proposed patch.
It's been like this for a long time, so back-patch to all supported
branches.
2014-02-12 20:52:16 +01:00
|
|
|
ReleaseBuffer(buffer);
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
buffer = ReadBufferWithoutRelcache(rlocator, forknum, blkno,
|
Add new block-by-block strategy for CREATE DATABASE.
Because this strategy logs changes on a block-by-block basis, it
avoids the need to checkpoint before and after the operation.
However, because it logs each changed block individually, it might
generate a lot of extra write-ahead logging if the template database
is large. Therefore, the older strategy remains available via a new
STRATEGY parameter to CREATE DATABASE, and a corresponding --strategy
option to createdb.
Somewhat controversially, this patch assembles the list of relations
to be copied to the new database by reading the pg_class relation of
the template database. Cross-database access like this isn't normally
possible, but it can be made to work here because there can't be any
connections to the database being copied, nor can it contain any
in-doubt transactions. Even so, we have to use lower-level interfaces
than normal, since the table scan and relcache interfaces will not
work for a database to which we're not connected. The advantage of
this approach is that we do not need to rely on the filesystem to
determine what ought to be copied, but instead on PostgreSQL's own
knowledge of the database structure. This avoids, for example,
copying stray files that happen to be located in the source database
directory.
Dilip Kumar, with a fairly large number of cosmetic changes by me.
Reviewed and tested by Ashutosh Sharma, Andres Freund, John Naylor,
Greg Nancarrow, Neha Sharma. Additional feedback from Bruce Momjian,
Heikki Linnakangas, Julien Rouhaud, Adam Brusselback, Kyotaro
Horiguchi, Tomas Vondra, Andrew Dunstan, Álvaro Herrera, and others.
Discussion: http://postgr.es/m/CA+TgmoYtcdxBjLh31DLxUXHxFVMPGzrU5_T=CYCvRyFHywSBUQ@mail.gmail.com
2022-03-29 17:31:43 +02:00
|
|
|
mode, NULL, true);
|
In XLogReadBufferExtended, don't assume P_NEW yields consecutive pages.
In a database that's not yet reached consistency, it's possible that some
segments of a relation are not full-size but are not the last ones either.
Because of the way smgrnblocks() works, asking for a new page with P_NEW
will fill in the last not-full-size segment --- and if that makes it full
size, the apparent EOF of the relation will increase by more than one page,
so that the next P_NEW request will yield a page past the next consecutive
one. This breaks the relation-extension logic in XLogReadBufferExtended,
possibly allowing a page update to be applied to some page far past where
it was intended to go. This appears to be the explanation for reports of
table bloat on replication slaves compared to their masters, and probably
explains some corrupted-slave reports as well.
Fix the loop to check the page number it actually got, rather than merely
Assert()'ing that dead reckoning got it to the desired place. AFAICT,
there are no other places that make assumptions about exactly which page
they'll get from P_NEW.
Problem identified by Greg Stark, though this is not the same as his
proposed patch.
It's been like this for a long time, so back-patch to all supported
branches.
2014-02-12 20:52:16 +01:00
|
|
|
}
|
2006-03-29 23:17:39 +02:00
|
|
|
}
|
|
|
|
|
2022-04-07 09:28:40 +02:00
|
|
|
recent_buffer_fast_path:
|
Unite ReadBufferWithFork, ReadBufferWithStrategy, and ZeroOrReadBuffer
functions into one ReadBufferExtended function that takes the strategy
and mode as arguments. There are three modes: RBM_NORMAL, which is the default
used by plain ReadBuffer(); RBM_ZERO, which replaces ZeroOrReadBuffer; and
a new mode RBM_ZERO_ON_ERROR, which allows callers to read corrupt pages
without throwing an error. The FSM needs the new mode to recover from
corrupt pages, which could happen if we crash after extending an FSM file,
and the new page is "torn".
Add fork number to some error messages in bufmgr.c, that still lacked it.
2008-10-31 16:05:00 +01:00
|
|
|
if (mode == RBM_NORMAL)
|
2006-03-29 23:17:39 +02:00
|
|
|
{
|
|
|
|
/* check that page has been initialized */
|
2016-04-20 15:31:19 +02:00
|
|
|
Page page = (Page) BufferGetPage(buffer);
|
2006-03-29 23:17:39 +02:00
|
|
|
|
2009-01-20 19:59:37 +01:00
|
|
|
/*
|
|
|
|
* We assume that PageIsNew is safe without a lock. During recovery,
|
|
|
|
* there should be no other backends that could modify the buffer at
|
|
|
|
* the same time.
|
|
|
|
*/
|
2008-07-13 22:45:47 +02:00
|
|
|
if (PageIsNew(page))
|
2006-04-14 22:27:24 +02:00
|
|
|
{
|
2009-01-20 19:59:37 +01:00
|
|
|
ReleaseBuffer(buffer);
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
log_invalid_page(rlocator, forknum, blkno, true);
|
2006-04-14 22:27:24 +02:00
|
|
|
return InvalidBuffer;
|
|
|
|
}
|
2000-10-20 13:01:21 +02:00
|
|
|
}
|
|
|
|
|
2006-01-11 09:43:13 +01:00
|
|
|
return buffer;
|
2000-10-20 13:01:21 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2019-08-19 09:21:39 +02:00
|
|
|
* Struct actually returned by CreateFakeRelcacheEntry, though the declared
|
2008-06-12 11:12:31 +02:00
|
|
|
* return type is Relation.
|
2000-10-20 13:01:21 +02:00
|
|
|
*/
|
2008-06-12 11:12:31 +02:00
|
|
|
typedef struct
|
2000-10-20 13:01:21 +02:00
|
|
|
{
|
2008-06-12 11:12:31 +02:00
|
|
|
RelationData reldata; /* Note: this must be first */
|
|
|
|
FormData_pg_class pgc;
|
|
|
|
} FakeRelCacheEntryData;
|
2000-10-20 13:01:21 +02:00
|
|
|
|
2008-06-12 11:12:31 +02:00
|
|
|
typedef FakeRelCacheEntryData *FakeRelCacheEntry;
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2008-06-12 11:12:31 +02:00
|
|
|
/*
|
|
|
|
* Create a fake relation cache entry for a physical relation
|
|
|
|
*
|
|
|
|
* It's often convenient to use the same functions in XLOG replay as in the
|
|
|
|
* main codepath, but those functions typically work with a relcache entry.
|
|
|
|
* We don't have a working relation cache during XLOG replay, but this
|
|
|
|
* function can be used to create a fake relcache entry instead. Only the
|
|
|
|
* fields related to physical storage, like rd_rel, are initialized, so the
|
|
|
|
* fake entry is only usable in low-level operations like ReadBuffer().
|
|
|
|
*
|
Skip WAL for new relfilenodes, under wal_level=minimal.
Until now, only selected bulk operations (e.g. COPY) did this. If a
given relfilenode received both a WAL-skipping COPY and a WAL-logged
operation (e.g. INSERT), recovery could lose tuples from the COPY. See
src/backend/access/transam/README section "Skipping WAL for New
RelFileNode" for the new coding rules. Maintainers of table access
methods should examine that section.
To maintain data durability, just before commit, we choose between an
fsync of the relfilenode and copying its contents to WAL. A new GUC,
wal_skip_threshold, guides that choice. If this change slows a workload
that creates small, permanent relfilenodes under wal_level=minimal, try
adjusting wal_skip_threshold. Users setting a timeout on COMMIT may
need to adjust that timeout, and log_min_duration_statement analysis
will reflect time consumption moving to COMMIT from commands like COPY.
Internally, this requires a reliable determination of whether
RollbackAndReleaseCurrentSubTransaction() would unlink a relation's
current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the
specification of rd_createSubid such that the field is zero when a new
rel has an old rd_node. Make relcache.c retain entries for certain
dropped relations until end of transaction.
Bump XLOG_PAGE_MAGIC, since this introduces XLOG_GIST_ASSIGN_LSN.
Future servers accept older WAL, so this bump is discretionary.
Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert
Haas. Heikki Linnakangas and Michael Paquier implemented earlier
designs that materially clarified the problem. Reviewed, in earlier
designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane,
Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout.
Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
2020-04-04 21:25:34 +02:00
|
|
|
* This is also used for syncing WAL-skipped files.
|
|
|
|
*
|
2008-06-12 11:12:31 +02:00
|
|
|
* Caller must free the returned entry with FreeFakeRelcacheEntry().
|
|
|
|
*/
|
|
|
|
Relation
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
CreateFakeRelcacheEntry(RelFileLocator rlocator)
|
2000-10-28 18:21:00 +02:00
|
|
|
{
|
2008-06-12 11:12:31 +02:00
|
|
|
FakeRelCacheEntry fakeentry;
|
|
|
|
Relation rel;
|
2000-10-20 13:01:21 +02:00
|
|
|
|
2008-06-12 11:12:31 +02:00
|
|
|
/* Allocate the Relation struct and all related space in one block. */
|
|
|
|
fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
|
|
|
|
rel = (Relation) fakeentry;
|
2000-10-28 18:21:00 +02:00
|
|
|
|
2008-06-12 11:12:31 +02:00
|
|
|
rel->rd_rel = &fakeentry->pgc;
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
rel->rd_locator = rlocator;
|
Skip WAL for new relfilenodes, under wal_level=minimal.
Until now, only selected bulk operations (e.g. COPY) did this. If a
given relfilenode received both a WAL-skipping COPY and a WAL-logged
operation (e.g. INSERT), recovery could lose tuples from the COPY. See
src/backend/access/transam/README section "Skipping WAL for New
RelFileNode" for the new coding rules. Maintainers of table access
methods should examine that section.
To maintain data durability, just before commit, we choose between an
fsync of the relfilenode and copying its contents to WAL. A new GUC,
wal_skip_threshold, guides that choice. If this change slows a workload
that creates small, permanent relfilenodes under wal_level=minimal, try
adjusting wal_skip_threshold. Users setting a timeout on COMMIT may
need to adjust that timeout, and log_min_duration_statement analysis
will reflect time consumption moving to COMMIT from commands like COPY.
Internally, this requires a reliable determination of whether
RollbackAndReleaseCurrentSubTransaction() would unlink a relation's
current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the
specification of rd_createSubid such that the field is zero when a new
rel has an old rd_node. Make relcache.c retain entries for certain
dropped relations until end of transaction.
Bump XLOG_PAGE_MAGIC, since this introduces XLOG_GIST_ASSIGN_LSN.
Future servers accept older WAL, so this bump is discretionary.
Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert
Haas. Heikki Linnakangas and Michael Paquier implemented earlier
designs that materially clarified the problem. Reviewed, in earlier
designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane,
Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout.
Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
2020-04-04 21:25:34 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We will never be working with temp rels during recovery or while
|
|
|
|
* syncing WAL-skipped files.
|
|
|
|
*/
|
2010-08-30 18:46:23 +02:00
|
|
|
rel->rd_backend = InvalidBackendId;
|
2001-10-05 19:28:13 +02:00
|
|
|
|
Skip WAL for new relfilenodes, under wal_level=minimal.
Until now, only selected bulk operations (e.g. COPY) did this. If a
given relfilenode received both a WAL-skipping COPY and a WAL-logged
operation (e.g. INSERT), recovery could lose tuples from the COPY. See
src/backend/access/transam/README section "Skipping WAL for New
RelFileNode" for the new coding rules. Maintainers of table access
methods should examine that section.
To maintain data durability, just before commit, we choose between an
fsync of the relfilenode and copying its contents to WAL. A new GUC,
wal_skip_threshold, guides that choice. If this change slows a workload
that creates small, permanent relfilenodes under wal_level=minimal, try
adjusting wal_skip_threshold. Users setting a timeout on COMMIT may
need to adjust that timeout, and log_min_duration_statement analysis
will reflect time consumption moving to COMMIT from commands like COPY.
Internally, this requires a reliable determination of whether
RollbackAndReleaseCurrentSubTransaction() would unlink a relation's
current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the
specification of rd_createSubid such that the field is zero when a new
rel has an old rd_node. Make relcache.c retain entries for certain
dropped relations until end of transaction.
Bump XLOG_PAGE_MAGIC, since this introduces XLOG_GIST_ASSIGN_LSN.
Future servers accept older WAL, so this bump is discretionary.
Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert
Haas. Heikki Linnakangas and Michael Paquier implemented earlier
designs that materially clarified the problem. Reviewed, in earlier
designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane,
Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout.
Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
2020-04-04 21:25:34 +02:00
|
|
|
/* It must be a permanent table here */
|
2012-09-14 15:35:07 +02:00
|
|
|
rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
|
|
|
|
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
/* We don't know the name of the relation; use relfilenumber instead */
|
|
|
|
sprintf(RelationGetRelationName(rel), "%u", rlocator.relNumber);
|
2001-10-05 19:28:13 +02:00
|
|
|
|
2008-06-12 11:12:31 +02:00
|
|
|
/*
|
|
|
|
* We set up the lockRelId in case anything tries to lock the dummy
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
* relation. Note that this is fairly bogus since relNumber may be
|
Skip WAL for new relfilenodes, under wal_level=minimal.
Until now, only selected bulk operations (e.g. COPY) did this. If a
given relfilenode received both a WAL-skipping COPY and a WAL-logged
operation (e.g. INSERT), recovery could lose tuples from the COPY. See
src/backend/access/transam/README section "Skipping WAL for New
RelFileNode" for the new coding rules. Maintainers of table access
methods should examine that section.
To maintain data durability, just before commit, we choose between an
fsync of the relfilenode and copying its contents to WAL. A new GUC,
wal_skip_threshold, guides that choice. If this change slows a workload
that creates small, permanent relfilenodes under wal_level=minimal, try
adjusting wal_skip_threshold. Users setting a timeout on COMMIT may
need to adjust that timeout, and log_min_duration_statement analysis
will reflect time consumption moving to COMMIT from commands like COPY.
Internally, this requires a reliable determination of whether
RollbackAndReleaseCurrentSubTransaction() would unlink a relation's
current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the
specification of rd_createSubid such that the field is zero when a new
rel has an old rd_node. Make relcache.c retain entries for certain
dropped relations until end of transaction.
Bump XLOG_PAGE_MAGIC, since this introduces XLOG_GIST_ASSIGN_LSN.
Future servers accept older WAL, so this bump is discretionary.
Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert
Haas. Heikki Linnakangas and Michael Paquier implemented earlier
designs that materially clarified the problem. Reviewed, in earlier
designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane,
Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout.
Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
2020-04-04 21:25:34 +02:00
|
|
|
* different from the relation's OID. It shouldn't really matter though.
|
|
|
|
* In recovery, we are running by ourselves and can't have any lock
|
|
|
|
* conflicts. While syncing, we already hold AccessExclusiveLock.
|
2008-06-12 11:12:31 +02:00
|
|
|
*/
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
rel->rd_lockInfo.lockRelId.dbId = rlocator.dbOid;
|
|
|
|
rel->rd_lockInfo.lockRelId.relId = rlocator.relNumber;
|
2000-10-28 18:21:00 +02:00
|
|
|
|
2008-06-12 11:12:31 +02:00
|
|
|
rel->rd_smgr = NULL;
|
2000-10-28 18:21:00 +02:00
|
|
|
|
2008-06-12 11:12:31 +02:00
|
|
|
return rel;
|
2000-10-20 13:01:21 +02:00
|
|
|
}
|
|
|
|
|
2004-02-11 23:55:26 +01:00
|
|
|
/*
|
2008-06-12 11:12:31 +02:00
|
|
|
* Free a fake relation cache entry.
|
2004-02-11 23:55:26 +01:00
|
|
|
*/
|
2008-06-12 11:12:31 +02:00
|
|
|
void
|
|
|
|
FreeFakeRelcacheEntry(Relation fakerel)
|
2000-10-20 13:01:21 +02:00
|
|
|
{
|
2014-03-07 12:25:11 +01:00
|
|
|
/* make sure the fakerel is not referenced by the SmgrRelation anymore */
|
|
|
|
if (fakerel->rd_smgr != NULL)
|
|
|
|
smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr);
|
2008-06-12 11:12:31 +02:00
|
|
|
pfree(fakerel);
|
2000-10-20 13:01:21 +02:00
|
|
|
}
|
2004-02-11 23:55:26 +01:00
|
|
|
|
|
|
|
/*
|
2006-03-29 23:17:39 +02:00
|
|
|
* Drop a relation during XLOG replay
|
2004-02-11 23:55:26 +01:00
|
|
|
*
|
2008-06-12 11:12:31 +02:00
|
|
|
* This is called when the relation is about to be deleted; we need to remove
|
|
|
|
* any open "invalid-page" records for the relation.
|
2004-02-11 23:55:26 +01:00
|
|
|
*/
|
|
|
|
void
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
|
2004-02-11 23:55:26 +01:00
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
forget_invalid_pages(rlocator, forknum, 0);
|
2004-02-11 23:55:26 +01:00
|
|
|
}
|
2006-03-29 23:17:39 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Drop a whole database during XLOG replay
|
|
|
|
*
|
|
|
|
* As above, but for DROP DATABASE instead of dropping a single rel
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
XLogDropDatabase(Oid dbid)
|
|
|
|
{
|
2008-06-12 11:12:31 +02:00
|
|
|
/*
|
|
|
|
* This is unnecessarily heavy-handed, as it will close SMgrRelation
|
|
|
|
* objects for other databases as well. DROP DATABASE occurs seldom enough
|
|
|
|
* that it's not worth introducing a variant of smgrclose for just this
|
|
|
|
* purpose. XXX: Or should we rather leave the smgr entries dangling?
|
|
|
|
*/
|
|
|
|
smgrcloseall();
|
2006-04-14 22:27:24 +02:00
|
|
|
|
|
|
|
forget_invalid_pages_db(dbid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Truncate a relation during XLOG replay
|
|
|
|
*
|
2008-06-12 11:12:31 +02:00
|
|
|
* We need to clean up any open "invalid-page" records for the dropped pages.
|
2006-04-14 22:27:24 +02:00
|
|
|
*/
|
|
|
|
void
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
XLogTruncateRelation(RelFileLocator rlocator, ForkNumber forkNum,
|
2008-08-11 13:05:11 +02:00
|
|
|
BlockNumber nblocks)
|
2006-04-14 22:27:24 +02:00
|
|
|
{
|
Change internal RelFileNode references to RelFileNumber or RelFileLocator.
We have been using the term RelFileNode to refer to either (1) the
integer that is used to name the sequence of files for a certain relation
within the directory set aside for that tablespace/database combination;
or (2) that value plus the OIDs of the tablespace and database; or
occasionally (3) the whole series of files created for a relation
based on those values. Using the same name for more than one thing is
confusing.
Replace RelFileNode with RelFileNumber when we're talking about just the
single number, i.e. (1) from above, and with RelFileLocator when we're
talking about all the things that are needed to locate a relation's files
on disk, i.e. (2) from above. In the places where we refer to (3) as
a relfilenode, instead refer to "relation storage".
Since there is a ton of SQL code in the world that knows about
pg_class.relfilenode, don't change the name of that column, or of other
SQL-facing things that derive their name from it.
On the other hand, do adjust closely-related internal terminology. For
example, the structure member names dbNode and spcNode appear to be
derived from the fact that the structure itself was called RelFileNode,
so change those to dbOid and spcOid. Likewise, various variables with
names like rnode and relnode get renamed appropriately, according to
how they're being used in context.
Hopefully, this is clearer than before. It is also preparation for
future patches that intend to widen the relfilenumber fields from its
current width of 32 bits. Variables that store a relfilenumber are now
declared as type RelFileNumber rather than type Oid; right now, these
are the same, but that can now more easily be changed.
Dilip Kumar, per an idea from me. Reviewed also by Andres Freund.
I fixed some whitespace issues, changed a couple of words in a
comment, and made one other minor correction.
Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
|
|
|
forget_invalid_pages(rlocator, forkNum, nblocks);
|
2006-03-29 23:17:39 +02:00
|
|
|
}
|
2016-01-21 02:18:58 +01:00
|
|
|
|
2017-03-22 08:05:12 +01:00
|
|
|
/*
|
|
|
|
* Determine which timeline to read an xlog page from and set the
|
|
|
|
* XLogReaderState's currTLI to that timeline ID.
|
|
|
|
*
|
|
|
|
* We care about timelines in xlogreader when we might be reading xlog
|
|
|
|
* generated prior to a promotion, either if we're currently a standby in
|
2020-06-14 23:05:18 +02:00
|
|
|
* recovery or if we're a promoted primary reading xlogs generated by the old
|
|
|
|
* primary before our promotion.
|
2017-03-22 08:05:12 +01:00
|
|
|
*
|
|
|
|
* wantPage must be set to the start address of the page to read and
|
|
|
|
* wantLength to the amount of the page that will be read, up to
|
|
|
|
* XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ.
|
|
|
|
*
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
* The currTLI argument should be the system-wide current timeline.
|
|
|
|
* Note that this may be different from state->currTLI, which is the timeline
|
|
|
|
* from which the caller is currently reading previous xlog records.
|
|
|
|
*
|
2017-03-22 08:05:12 +01:00
|
|
|
* We switch to an xlog segment from the new timeline eagerly when on a
|
|
|
|
* historical timeline, as soon as we reach the start of the xlog segment
|
|
|
|
* containing the timeline switch. The server copied the segment to the new
|
|
|
|
* timeline so all the data up to the switch point is the same, but there's no
|
|
|
|
* guarantee the old segment will still exist. It may have been deleted or
|
|
|
|
* renamed with a .partial suffix so we can't necessarily keep reading from
|
|
|
|
* the old TLI even though tliSwitchPoint says it's OK.
|
|
|
|
*
|
|
|
|
* We can't just check the timeline when we read a page on a different segment
|
|
|
|
* to the last page. We could've received a timeline switch from a cascading
|
2017-04-17 01:47:37 +02:00
|
|
|
* upstream, so the current segment ends abruptly (possibly getting renamed to
|
2017-03-22 08:05:12 +01:00
|
|
|
* .partial) and we have to switch to a new one. Even in the middle of reading
|
|
|
|
* a page we could have to dump the cached page and switch to a new TLI.
|
|
|
|
*
|
|
|
|
* Because of this, callers MAY NOT assume that currTLI is the timeline that
|
|
|
|
* will be in a page's xlp_tli; the page may begin on an older timeline or we
|
|
|
|
* might be reading from historical timeline data on a segment that's been
|
|
|
|
* copied to a new timeline.
|
|
|
|
*
|
|
|
|
* The caller must also make sure it doesn't read past the current replay
|
2020-05-12 07:43:57 +02:00
|
|
|
* position (using GetXLogReplayRecPtr) if executing in recovery, so it
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
* doesn't fail to notice that the current timeline became historical.
|
2017-03-22 08:05:12 +01:00
|
|
|
*/
|
|
|
|
void
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage,
|
|
|
|
uint32 wantLength, TimeLineID currTLI)
|
2017-03-22 08:05:12 +01:00
|
|
|
{
|
2021-05-10 06:00:53 +02:00
|
|
|
const XLogRecPtr lastReadPage = (state->seg.ws_segno *
|
|
|
|
state->segcxt.ws_segsize + state->segoff);
|
2017-03-22 08:05:12 +01:00
|
|
|
|
|
|
|
Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0);
|
|
|
|
Assert(wantLength <= XLOG_BLCKSZ);
|
|
|
|
Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ);
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
Assert(currTLI != 0);
|
2017-03-22 08:05:12 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the desired page is currently read in and valid, we have nothing to
|
|
|
|
* do.
|
|
|
|
*
|
|
|
|
* The caller should've ensured that it didn't previously advance readOff
|
|
|
|
* past the valid limit of this timeline, so it doesn't matter if the
|
|
|
|
* current TLI has since become historical.
|
|
|
|
*/
|
|
|
|
if (lastReadPage == wantPage &&
|
2021-05-10 06:00:53 +02:00
|
|
|
state->readLen != 0 &&
|
2017-03-22 08:05:12 +01:00
|
|
|
lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're reading from the current timeline, it hasn't become historical
|
|
|
|
* and the page we're reading is after the last page read, we can again
|
|
|
|
* just carry on. (Seeking backwards requires a check to make sure the
|
|
|
|
* older page isn't on a prior timeline).
|
|
|
|
*
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
* currTLI might've become historical since the caller obtained the value,
|
|
|
|
* but the caller is required not to read past the flush limit it saw at
|
|
|
|
* the time it looked up the timeline. There's nothing we can do about it
|
|
|
|
* if StartupXLOG() renames it to .partial concurrently.
|
2017-03-22 08:05:12 +01:00
|
|
|
*/
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
if (state->currTLI == currTLI && wantPage >= lastReadPage)
|
2017-03-22 08:05:12 +01:00
|
|
|
{
|
|
|
|
Assert(state->currTLIValidUntil == InvalidXLogRecPtr);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're just reading pages from a previously validated historical
|
|
|
|
* timeline and the timeline we're reading from is valid until the end of
|
|
|
|
* the current segment we can just keep reading.
|
|
|
|
*/
|
|
|
|
if (state->currTLIValidUntil != InvalidXLogRecPtr &&
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
state->currTLI != currTLI &&
|
2017-03-22 08:05:12 +01:00
|
|
|
state->currTLI != 0 &&
|
2019-09-24 21:08:31 +02:00
|
|
|
((wantPage + wantLength) / state->segcxt.ws_segsize) <
|
|
|
|
(state->currTLIValidUntil / state->segcxt.ws_segsize))
|
2017-03-22 08:05:12 +01:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we reach this point we're either looking up a page for random
|
|
|
|
* access, the current timeline just became historical, or we're reading
|
|
|
|
* from a new segment containing a timeline switch. In all cases we need
|
|
|
|
* to determine the newest timeline on the segment.
|
|
|
|
*
|
|
|
|
* If it's the current timeline we can just keep reading from here unless
|
|
|
|
* we detect a timeline switch that makes the current timeline historical.
|
|
|
|
* If it's a historical timeline we can read all the segment on the newest
|
|
|
|
* timeline because it contains all the old timelines' data too. So only
|
|
|
|
* one switch check is required.
|
|
|
|
*/
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We need to re-read the timeline history in case it's been changed
|
|
|
|
* by a promotion or replay from a cascaded replica.
|
|
|
|
*/
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
List *timelineHistory = readTimeLineHistory(currTLI);
|
2019-09-24 21:08:31 +02:00
|
|
|
XLogRecPtr endOfSegment;
|
2017-03-22 08:05:12 +01:00
|
|
|
|
2019-09-24 21:08:31 +02:00
|
|
|
endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) *
|
|
|
|
state->segcxt.ws_segsize - 1;
|
|
|
|
Assert(wantPage / state->segcxt.ws_segsize ==
|
|
|
|
endOfSegment / state->segcxt.ws_segsize);
|
2017-03-22 08:05:12 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the timeline of the last LSN on the segment containing
|
|
|
|
* wantPage.
|
|
|
|
*/
|
|
|
|
state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory);
|
|
|
|
state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory,
|
|
|
|
&state->nextTLI);
|
|
|
|
|
|
|
|
Assert(state->currTLIValidUntil == InvalidXLogRecPtr ||
|
|
|
|
wantPage + wantLength < state->currTLIValidUntil);
|
|
|
|
|
|
|
|
list_free_deep(timelineHistory);
|
|
|
|
|
|
|
|
elog(DEBUG3, "switched to timeline %u valid until %X/%X",
|
|
|
|
state->currTLI,
|
2021-02-23 10:14:38 +01:00
|
|
|
LSN_FORMAT_ARGS(state->currTLIValidUntil));
|
2017-03-22 08:05:12 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-08 21:30:34 +02:00
|
|
|
/* XLogReaderRoutine->segment_open callback for local pg_wal files */
|
2020-05-13 18:17:08 +02:00
|
|
|
void
|
2020-05-08 21:30:34 +02:00
|
|
|
wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo,
|
2020-05-13 18:17:08 +02:00
|
|
|
TimeLineID *tli_p)
|
2019-11-25 19:04:54 +01:00
|
|
|
{
|
|
|
|
TimeLineID tli = *tli_p;
|
|
|
|
char path[MAXPGPATH];
|
|
|
|
|
2020-05-13 18:17:08 +02:00
|
|
|
XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
|
|
|
|
state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
|
|
|
|
if (state->seg.ws_file >= 0)
|
|
|
|
return;
|
2019-11-25 19:04:54 +01:00
|
|
|
|
|
|
|
if (errno == ENOENT)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
errmsg("requested WAL segment %s has already been removed",
|
|
|
|
path)));
|
|
|
|
else
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
errmsg("could not open file \"%s\": %m",
|
|
|
|
path)));
|
|
|
|
}
|
|
|
|
|
2020-05-08 21:30:34 +02:00
|
|
|
/* stock XLogReaderRoutine->segment_close callback */
|
|
|
|
void
|
|
|
|
wal_segment_close(XLogReaderState *state)
|
|
|
|
{
|
|
|
|
close(state->seg.ws_file);
|
|
|
|
/* need to check errno? */
|
|
|
|
state->seg.ws_file = -1;
|
|
|
|
}
|
|
|
|
|
2016-01-21 02:18:58 +01:00
|
|
|
/*
|
2020-05-08 21:30:34 +02:00
|
|
|
* XLogReaderRoutine->page_read callback for reading local xlog files
|
2016-01-21 02:18:58 +01:00
|
|
|
*
|
|
|
|
* Public because it would likely be very helpful for someone writing another
|
|
|
|
* output method outside walsender, e.g. in a bgworker.
|
|
|
|
*
|
2016-03-30 23:56:13 +02:00
|
|
|
* TODO: The walsender has its own version of this, but it relies on the
|
2016-01-21 02:18:58 +01:00
|
|
|
* walsender's latch being set whenever WAL is flushed. No such infrastructure
|
|
|
|
* exists for normal backends, so we have to do a check/sleep/repeat style of
|
|
|
|
* loop for now.
|
|
|
|
*/
|
2021-05-10 06:00:53 +02:00
|
|
|
int
|
|
|
|
read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr,
|
|
|
|
int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
|
2022-04-08 09:02:10 +02:00
|
|
|
{
|
|
|
|
return read_local_xlog_page_guts(state, targetPagePtr, reqLen,
|
|
|
|
targetRecPtr, cur_page, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Same as read_local_xlog_page except that it doesn't wait for future WAL
|
|
|
|
* to be available.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
read_local_xlog_page_no_wait(XLogReaderState *state, XLogRecPtr targetPagePtr,
|
|
|
|
int reqLen, XLogRecPtr targetRecPtr,
|
|
|
|
char *cur_page)
|
|
|
|
{
|
|
|
|
return read_local_xlog_page_guts(state, targetPagePtr, reqLen,
|
|
|
|
targetRecPtr, cur_page, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Implementation of read_local_xlog_page and its no wait version.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr,
|
|
|
|
int reqLen, XLogRecPtr targetRecPtr,
|
|
|
|
char *cur_page, bool wait_for_wal)
|
2016-01-21 02:18:58 +01:00
|
|
|
{
|
2016-03-31 01:07:05 +02:00
|
|
|
XLogRecPtr read_upto,
|
2016-01-21 02:18:58 +01:00
|
|
|
loc;
|
2019-11-25 19:04:54 +01:00
|
|
|
TimeLineID tli;
|
2016-01-21 02:18:58 +01:00
|
|
|
int count;
|
2019-11-25 19:04:54 +01:00
|
|
|
WALReadError errinfo;
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
TimeLineID currTLI;
|
2016-01-21 02:18:58 +01:00
|
|
|
|
|
|
|
loc = targetPagePtr + reqLen;
|
2017-03-22 08:05:12 +01:00
|
|
|
|
|
|
|
/* Loop waiting for xlog to be available if necessary */
|
2016-01-21 02:18:58 +01:00
|
|
|
while (1)
|
|
|
|
{
|
|
|
|
/*
|
2017-03-22 08:05:12 +01:00
|
|
|
* Determine the limit of xlog we can currently read to, and what the
|
|
|
|
* most recent timeline is.
|
2016-01-21 02:18:58 +01:00
|
|
|
*/
|
2016-05-04 22:32:22 +02:00
|
|
|
if (!RecoveryInProgress())
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
read_upto = GetFlushRecPtr(&currTLI);
|
2016-01-21 02:18:58 +01:00
|
|
|
else
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
read_upto = GetXLogReplayRecPtr(&currTLI);
|
|
|
|
tli = currTLI;
|
2017-03-22 08:05:12 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check which timeline to get the record from.
|
|
|
|
*
|
|
|
|
* We have to do it each time through the loop because if we're in
|
|
|
|
* recovery as a cascading standby, the current timeline might've
|
|
|
|
* become historical. We can't rely on RecoveryInProgress() because in
|
|
|
|
* a standby configuration like
|
|
|
|
*
|
|
|
|
* A => B => C
|
|
|
|
*
|
|
|
|
* if we're a logical decoding session on C, and B gets promoted, our
|
|
|
|
* timeline will change while we remain in recovery.
|
|
|
|
*
|
|
|
|
* We can't just keep reading from the old timeline as the last WAL
|
|
|
|
* archive in the timeline will get renamed to .partial by
|
|
|
|
* StartupXLOG().
|
|
|
|
*
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
* If that happens after our caller determined the TLI but before we
|
2017-03-22 08:05:12 +01:00
|
|
|
* actually read the xlog page, we might still try to read from the
|
|
|
|
* old (now renamed) segment and fail. There's not much we can do
|
|
|
|
* about this, but it can only happen when we're a leaf of a cascading
|
2020-06-14 23:05:18 +02:00
|
|
|
* standby whose primary gets promoted while we're decoding, so a
|
2017-03-22 08:05:12 +01:00
|
|
|
* one-off ERROR isn't too bad.
|
|
|
|
*/
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
XLogReadDetermineTimeline(state, targetPagePtr, reqLen, tli);
|
2017-03-22 08:05:12 +01:00
|
|
|
|
Remove all use of ThisTimeLineID global variable outside of xlog.c
All such code deals with this global variable in one of three ways.
Sometimes the same functions use it in more than one of these ways
at the same time.
First, sometimes it's an implicit argument to one or more functions
being called in xlog.c or elsewhere, and must be set to the
appropriate value before calling those functions lest they
misbehave. In those cases, it is now passed as an explicit argument
instead.
Second, sometimes it's used to obtain the current timeline after
the end of recovery, i.e. the timeline to which WAL is being
written and flushed. Such code now calls GetWALInsertionTimeLine()
or relies on the new out parameter added to GetFlushRecPtr().
Third, sometimes it's used during recovery to store the current
replay timeline. That can change, so such code must generally
update the value before each use. It can still do that, but must
now use a local variable instead.
The net effect of these changes is to reduce by a fair amount the
amount of code that is directly accessing this global variable.
That's good, because history has shown that we don't always think
clearly about which timeline ID it's supposed to contain at any
given point in time, or indeed, whether it has been or needs to
be initialized at any given point in the code.
Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and
Álvaro Herrera.
Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
|
|
|
if (state->currTLI == currTLI)
|
2017-03-22 08:05:12 +01:00
|
|
|
{
|
2016-05-04 22:32:22 +02:00
|
|
|
|
2017-03-22 08:05:12 +01:00
|
|
|
if (loc <= read_upto)
|
|
|
|
break;
|
|
|
|
|
2022-04-08 09:02:10 +02:00
|
|
|
/* If asked, let's not wait for future WAL. */
|
|
|
|
if (!wait_for_wal)
|
2022-04-30 17:28:33 +02:00
|
|
|
{
|
|
|
|
ReadLocalXLogPageNoWaitPrivate *private_data;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Inform the caller of read_local_xlog_page_no_wait that the
|
|
|
|
* end of WAL has been reached.
|
|
|
|
*/
|
|
|
|
private_data = (ReadLocalXLogPageNoWaitPrivate *)
|
|
|
|
state->private_data;
|
|
|
|
private_data->end_of_wal = true;
|
2022-04-08 09:02:10 +02:00
|
|
|
break;
|
2022-04-30 17:28:33 +02:00
|
|
|
}
|
2022-04-08 09:02:10 +02:00
|
|
|
|
2017-03-22 08:05:12 +01:00
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
pg_usleep(1000L);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We're on a historical timeline, so limit reading to the switch
|
|
|
|
* point where we moved to the next timeline.
|
|
|
|
*
|
|
|
|
* We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know
|
|
|
|
* about the new timeline, so we must've received past the end of
|
|
|
|
* it.
|
|
|
|
*/
|
|
|
|
read_upto = state->currTLIValidUntil;
|
|
|
|
|
|
|
|
/*
|
2019-11-25 19:04:54 +01:00
|
|
|
* Setting tli to our wanted record's TLI is slightly wrong; the
|
|
|
|
* page might begin on an older timeline if it contains a timeline
|
|
|
|
* switch, since its xlog segment will have been copied from the
|
|
|
|
* prior timeline. This is pretty harmless though, as nothing
|
|
|
|
* cares so long as the timeline doesn't go backwards. We should
|
|
|
|
* read the page header instead; FIXME someday.
|
2017-03-22 08:05:12 +01:00
|
|
|
*/
|
2019-11-25 19:04:54 +01:00
|
|
|
tli = state->currTLI;
|
2017-03-22 08:05:12 +01:00
|
|
|
|
|
|
|
/* No need to wait on a historical timeline */
|
|
|
|
break;
|
|
|
|
}
|
2016-01-21 02:18:58 +01:00
|
|
|
}
|
|
|
|
|
2016-03-31 01:07:05 +02:00
|
|
|
if (targetPagePtr + XLOG_BLCKSZ <= read_upto)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* more than one block available; read only that block, have caller
|
|
|
|
* come back if they need more.
|
|
|
|
*/
|
2016-01-21 02:18:58 +01:00
|
|
|
count = XLOG_BLCKSZ;
|
2016-03-31 01:07:05 +02:00
|
|
|
}
|
|
|
|
else if (targetPagePtr + reqLen > read_upto)
|
|
|
|
{
|
|
|
|
/* not enough data there */
|
2021-05-10 06:00:53 +02:00
|
|
|
return -1;
|
2016-03-31 01:07:05 +02:00
|
|
|
}
|
2016-01-21 02:18:58 +01:00
|
|
|
else
|
2016-03-31 01:07:05 +02:00
|
|
|
{
|
|
|
|
/* enough bytes available to satisfy the request */
|
|
|
|
count = read_upto - targetPagePtr;
|
|
|
|
}
|
2016-01-21 02:18:58 +01:00
|
|
|
|
2016-03-31 01:07:05 +02:00
|
|
|
/*
|
|
|
|
* Even though we just determined how much of the page can be validly read
|
|
|
|
* as 'count', read the whole page anyway. It's guaranteed to be
|
|
|
|
* zero-padded up to the page boundary if it's incomplete.
|
|
|
|
*/
|
2021-05-10 06:00:53 +02:00
|
|
|
if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli,
|
|
|
|
&errinfo))
|
2019-11-25 19:04:54 +01:00
|
|
|
WALReadRaiseError(&errinfo);
|
2016-01-21 02:18:58 +01:00
|
|
|
|
2016-03-31 01:07:05 +02:00
|
|
|
/* number of valid bytes in the buffer */
|
2021-05-10 06:00:53 +02:00
|
|
|
return count;
|
2016-01-21 02:18:58 +01:00
|
|
|
}
|
2019-11-25 19:04:54 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Backend-specific convenience code to handle read errors encountered by
|
|
|
|
* WALRead().
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
WALReadRaiseError(WALReadError *errinfo)
|
|
|
|
{
|
|
|
|
WALOpenSegment *seg = &errinfo->wre_seg;
|
2019-12-03 07:06:04 +01:00
|
|
|
char fname[MAXFNAMELEN];
|
|
|
|
|
|
|
|
XLogFileName(fname, seg->ws_tli, seg->ws_segno, wal_segment_size);
|
2019-11-25 19:04:54 +01:00
|
|
|
|
|
|
|
if (errinfo->wre_read < 0)
|
|
|
|
{
|
|
|
|
errno = errinfo->wre_errno;
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
2021-12-29 10:08:41 +01:00
|
|
|
errmsg("could not read from log segment %s, offset %d: %m",
|
2019-11-25 19:04:54 +01:00
|
|
|
fname, errinfo->wre_off)));
|
|
|
|
}
|
|
|
|
else if (errinfo->wre_read == 0)
|
|
|
|
{
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
2021-12-29 10:08:41 +01:00
|
|
|
errmsg("could not read from log segment %s, offset %d: read %d of %d",
|
2019-11-25 19:04:54 +01:00
|
|
|
fname, errinfo->wre_off, errinfo->wre_read,
|
2021-08-08 22:05:42 +02:00
|
|
|
errinfo->wre_req)));
|
2019-11-25 19:04:54 +01:00
|
|
|
}
|
|
|
|
}
|