postgresql/src/backend/access/transam/xlog.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

9241 lines
294 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* xlog.c
* PostgreSQL write-ahead log manager
*
* The Write-Ahead Log (WAL) functionality is split into several source
* files, in addition to this one:
*
* xloginsert.c - Functions for constructing WAL records
* xlogrecovery.c - WAL recovery and standby code
* xlogreader.c - Facility for reading WAL files and parsing WAL records
* xlogutils.c - Helper functions for WAL redo routines
*
* This file contains functions for coordinating database startup and
* checkpointing, and managing the write-ahead log buffers when the
* system is running.
*
* StartupXLOG() is the main entry point of the startup process. It
* coordinates database startup, performing WAL recovery, and the
* transition from WAL recovery into normal operations.
*
* XLogInsertRecord() inserts a WAL record into the WAL buffers. Most
* callers should not call this directly, but use the functions in
* xloginsert.c to construct the WAL record. XLogFlush() can be used
* to force the WAL to disk.
*
* In addition to those, there are many other functions for interrogating
* the current system state, and for starting/stopping backups.
*
*
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
2010-09-20 22:08:53 +02:00
* src/backend/access/transam/xlog.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include <math.h>
#include <time.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>
#include "access/clog.h"
#include "access/commit_ts.h"
#include "access/heaptoast.h"
#include "access/multixact.h"
#include "access/rewriteheap.h"
#include "access/subtrans.h"
#include "access/timeline.h"
#include "access/transam.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/xlogarchive.h"
#include "access/xloginsert.h"
#include "access/xlogreader.h"
#include "access/xlogrecovery.h"
#include "access/xlogutils.h"
#include "catalog/catversion.h"
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
#include "catalog/pg_control.h"
#include "catalog/pg_database.h"
#include "common/controldata_utils.h"
#include "common/file_utils.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "port/pg_iovec.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
#include "postmaster/walwriter.h"
#include "replication/basebackup.h"
#include "replication/logical.h"
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
#include "replication/origin.h"
#include "replication/slot.h"
#include "replication/snapbuild.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/large_object.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/reinit.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "storage/sync.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
#include "utils/relmapper.h"
#include "utils/pg_rusage.h"
#include "utils/snapmgr.h"
#include "utils/timeout.h"
#include "utils/timestamp.h"
extern uint32 bootstrap_data_checksum_version;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
/* timeline ID to be used when bootstrapping */
#define BootstrapTimeLineID 1
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/* User-settable parameters */
int max_wal_size_mb = 1024; /* 1 GB */
int min_wal_size_mb = 80; /* 80 MB */
int wal_keep_size_mb = 0;
int XLOGbuffers = -1;
int XLogArchiveTimeout = 0;
int XLogArchiveMode = ARCHIVE_MODE_OFF;
char *XLogArchiveCommand = NULL;
bool EnableHotStandby = false;
bool fullPageWrites = true;
bool wal_log_hints = false;
int wal_compression = WAL_COMPRESSION_NONE;
char *wal_consistency_checking_string = NULL;
bool *wal_consistency_checking = NULL;
bool wal_init_zero = true;
bool wal_recycle = true;
bool log_checkpoints = true;
int sync_method = DEFAULT_SYNC_METHOD;
int wal_level = WAL_LEVEL_MINIMAL;
int CommitDelay = 0; /* precommit delay in microseconds */
int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int wal_retrieve_retry_interval = 5000;
int max_slot_wal_keep_size_mb = -1;
bool track_wal_io_timing = false;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
#ifdef WAL_DEBUG
bool XLOG_DEBUG = false;
#endif
int wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
/*
* Number of WAL insertion locks to use. A higher value allows more insertions
* to happen concurrently, but adds some CPU overhead to flushing the WAL,
* which needs to iterate all the locks.
*/
#define NUM_XLOGINSERT_LOCKS 8
/*
* Max distance from last checkpoint, before triggering a new xlog-based
* checkpoint.
*/
int CheckPointSegments;
/* Estimated distance between checkpoints, in bytes */
static double CheckPointDistanceEstimate = 0;
static double PrevCheckPointDistance = 0;
2008-05-12 10:35:05 +02:00
/*
* GUC support
*/
const struct config_enum_entry sync_method_options[] = {
{"fsync", SYNC_METHOD_FSYNC, false},
2008-05-12 10:35:05 +02:00
#ifdef HAVE_FSYNC_WRITETHROUGH
{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
2008-05-12 10:35:05 +02:00
#endif
#ifdef HAVE_FDATASYNC
{"fdatasync", SYNC_METHOD_FDATASYNC, false},
2008-05-12 10:35:05 +02:00
#endif
#ifdef OPEN_SYNC_FLAG
{"open_sync", SYNC_METHOD_OPEN, false},
2008-05-12 10:35:05 +02:00
#endif
#ifdef OPEN_DATASYNC_FLAG
{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
2008-05-12 10:35:05 +02:00
#endif
{NULL, 0, false}
2008-05-12 10:35:05 +02:00
};
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Although only "on", "off", and "always" are documented,
* we accept all the likely variants of "on" and "off".
*/
const struct config_enum_entry archive_mode_options[] = {
{"always", ARCHIVE_MODE_ALWAYS, false},
{"on", ARCHIVE_MODE_ON, false},
{"off", ARCHIVE_MODE_OFF, false},
{"true", ARCHIVE_MODE_ON, true},
{"false", ARCHIVE_MODE_OFF, true},
{"yes", ARCHIVE_MODE_ON, true},
{"no", ARCHIVE_MODE_OFF, true},
{"1", ARCHIVE_MODE_ON, true},
{"0", ARCHIVE_MODE_OFF, true},
{NULL, 0, false}
};
/*
* Statistics for current checkpoint are collected in this global struct.
* Because only the checkpointer or a stand-alone backend can perform
* checkpoints, this will be unused in normal backends.
*/
CheckpointStatsData CheckpointStats;
/*
* During recovery, lastFullPageWrites keeps track of full_page_writes that
* the replayed WAL records indicate. It's initialized with full_page_writes
* that the recovery starting checkpoint record indicates, and then updated
* each time XLOG_FPW_CHANGE record is replayed.
*/
static bool lastFullPageWrites;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* Local copy of the state tracked by SharedRecoveryState in shared memory,
* It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually
* means "not known, need to check the shared state".
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
static bool LocalRecoveryInProgress = true;
2011-04-10 17:42:00 +02:00
/*
* Local state for XLogInsertAllowed():
* 1: unconditionally allowed to insert XLOG
* 0: unconditionally not allowed to insert XLOG
* -1: must check RecoveryInProgress(); disallow until it is false
* Most processes start with -1 and transition to 1 after seeing that recovery
* is not in progress. But we can also force the value for special cases.
* The coding in XLogInsertAllowed() depends on the first two of these states
* being numerically the same as bool true and false.
*/
static int LocalXLogInsertAllowed = -1;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* ProcLastRecPtr points to the start of the last XLOG record inserted by the
* current backend. It is updated for all inserts. XactLastRecEnd points to
* end+1 of the last record, and is reset when we end a top-level transaction,
* or start a new one; so it can be used to tell if the current transaction has
* created any XLOG records.
*
* While in parallel mode, this may not be fully up to date. When committing,
* a transaction can assume this covers all xlog records written either by the
* user backend or by any parallel worker which was present at any point during
* the transaction. But when aborting, or when still in parallel mode, other
* parallel backends may have written WAL records at later LSNs than the value
* stored here. The parallel leader advances its own copy, when necessary,
* in WaitForParallelWorkersToFinish.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr;
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* RedoRecPtr is this backend's local copy of the REDO record pointer
* (which is almost but not quite the same as a pointer to the most recent
* CHECKPOINT record). We update this from the shared-memory copy,
* XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
* hold an insertion lock). See XLogInsertRecord for details. We are also
* allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
* see GetRedoRecPtr.
*
* NB: Code that uses this variable must be prepared not only for the
* possibility that it may be arbitrarily out of date, but also for the
* possibility that it might be set to InvalidXLogRecPtr. We used to
* initialize it as a side effect of the first call to RecoveryInProgress(),
* which meant that most code that might use it could assume that it had a
* real if perhaps stale value. That's no longer the case.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
static XLogRecPtr RedoRecPtr;
/*
* doPageWrites is this backend's local copy of (forcePageWrites ||
* fullPageWrites). It is used together with RedoRecPtr to decide whether
* a full-page image of a page need to be taken.
*
* NB: Initially this is false, and there's no guarantee that it will be
* initialized to any other value before it is first used. Any code that
* makes use of it must recheck the value after obtaining a WALInsertLock,
* and respond appropriately if it turns out that the previous value wasn't
* accurate.
*/
static bool doPageWrites;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*----------
* Shared-memory data structures for XLOG control
*
* LogwrtRqst indicates a byte position that we need to write and/or fsync
* the log up to (all records before that point must be written or fsynced).
* LogwrtResult indicates the byte positions we have already written/fsynced.
* These structs are identical but are declared separately to indicate their
* slightly different functions.
*
* To read XLogCtl->LogwrtResult, you must hold either info_lck or
* WALWriteLock. To update it, you need to hold both locks. The point of
* this arrangement is that the value can be examined by code that already
* holds WALWriteLock without needing to grab info_lck as well. In addition
* to the shared variable, each backend has a private copy of LogwrtResult,
* which is updated when convenient.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*
* The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
* (protected by info_lck), but we don't need to cache any copies of it.
*
* info_lck is only held long enough to read/update the protected variables,
* so it's a plain spinlock. The other locks are held longer (potentially
* over I/O operations), so we use LWLocks for them. These locks are:
*
* WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
* It is only held while initializing and changing the mapping. If the
* contents of the buffer being replaced haven't been written yet, the mapping
* lock is released while the write is done, and reacquired afterwards.
*
* WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
* XLogFlush).
*
* ControlFileLock: must be held to read/update control file or create
* new log file.
*
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*----------
*/
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
typedef struct XLogwrtRqst
{
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
XLogRecPtr Write; /* last byte + 1 to write out */
XLogRecPtr Flush; /* last byte + 1 to flush */
} XLogwrtRqst;
typedef struct XLogwrtResult
{
XLogRecPtr Write; /* last byte + 1 written out */
XLogRecPtr Flush; /* last byte + 1 flushed */
} XLogwrtResult;
/*
* Inserting to WAL is protected by a small fixed number of WAL insertion
* locks. To insert to the WAL, you must hold one of the locks - it doesn't
* matter which one. To lock out other concurrent insertions, you must hold
* of them. Each WAL insertion lock consists of a lightweight lock, plus an
* indicator of how far the insertion has progressed (insertingAt).
*
* The insertingAt values are read when a process wants to flush WAL from
* the in-memory buffers to disk, to check that all the insertions to the
* region the process is about to write out have finished. You could simply
* wait for all currently in-progress insertions to finish, but the
* insertingAt indicator allows you to ignore insertions to later in the WAL,
* so that you only wait for the insertions that are modifying the buffers
* you're about to write out.
*
* This isn't just an optimization. If all the WAL buffers are dirty, an
* inserter that's holding a WAL insert lock might need to evict an old WAL
* buffer, which requires flushing the WAL. If it's possible for an inserter
* to block on another inserter unnecessarily, deadlock can arise when two
* inserters holding a WAL insert lock wait for each other to finish their
* insertion.
*
* Small WAL records that don't cross a page boundary never update the value,
* the WAL record is just copied to the page and the lock is released. But
* to avoid the deadlock-scenario explained above, the indicator is always
* updated before sleeping while holding an insertion lock.
*
* lastImportantAt contains the LSN of the last important WAL record inserted
* using a given lock. This value is used to detect if there has been
* important WAL activity since the last time some action, like a checkpoint,
* was performed - allowing to not repeat the action if not. The LSN is
* updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
* set. lastImportantAt is never cleared, only overwritten by the LSN of newer
* records. Tracking the WAL activity directly in WALInsertLock has the
* advantage of not needing any additional locks to update the value.
*/
typedef struct
{
LWLock lock;
XLogRecPtr insertingAt;
XLogRecPtr lastImportantAt;
} WALInsertLock;
/*
* All the WAL insertion locks are allocated as an array in shared memory. We
* force the array stride to be a power of 2, which saves a few cycles in
* indexing, but more importantly also ensures that individual slots don't
* cross cache line boundaries. (Of course, we have to also ensure that the
* array start address is suitably aligned.)
*/
typedef union WALInsertLockPadded
{
WALInsertLock l;
char pad[PG_CACHE_LINE_SIZE];
} WALInsertLockPadded;
/*
* State of an exclusive backup, necessary to control concurrent activities
* across sessions when working on exclusive backups.
*
* EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
* running, to be more precise pg_start_backup() is not being executed for
* an exclusive backup and there is no exclusive backup in progress.
* EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
* exclusive backup.
* EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
* running and an exclusive backup is in progress. pg_stop_backup() is
* needed to finish it.
* EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
* exclusive backup.
*/
typedef enum ExclusiveBackupState
{
EXCLUSIVE_BACKUP_NONE = 0,
EXCLUSIVE_BACKUP_STARTING,
EXCLUSIVE_BACKUP_IN_PROGRESS,
EXCLUSIVE_BACKUP_STOPPING
} ExclusiveBackupState;
/*
* Session status of running backup, used for sanity checks in SQL-callable
* functions to start and stop backups.
*/
static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Shared state data for WAL insertion.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
typedef struct XLogCtlInsert
{
slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */
/*
* CurrBytePos is the end of reserved WAL. The next record will be
* inserted at that position. PrevBytePos is the start position of the
* previously inserted (or rather, reserved) record - it is copied to the
* prev-link of the next record. These are stored as "usable byte
* positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
*/
uint64 CurrBytePos;
uint64 PrevBytePos;
/*
* Make sure the above heavily-contended spinlock and byte positions are
* on their own cache line. In particular, the RedoRecPtr and full page
* write variables below should be on a different cache line. They are
* read on every WAL insertion, but updated rarely, and we don't want
* those reads to steal the cache line containing Curr/PrevBytePos.
*/
char pad[PG_CACHE_LINE_SIZE];
/*
* fullPageWrites is the authoritative value used by all backends to
* determine whether to write full-page image to WAL. This shared value,
* instead of the process-local fullPageWrites, is required because, when
* full_page_writes is changed by SIGHUP, we must WAL-log it before it
* actually affects WAL-logging by backends. Checkpointer sets at startup
* or after SIGHUP.
*
* To read these fields, you must hold an insertion lock. To modify them,
* you must hold ALL the locks.
*/
XLogRecPtr RedoRecPtr; /* current redo point for insertions */
bool forcePageWrites; /* forcing full-page writes for PITR? */
bool fullPageWrites;
/*
* exclusiveBackupState indicates the state of an exclusive backup (see
* comments of ExclusiveBackupState for more details). nonExclusiveBackups
* is a counter indicating the number of streaming base backups currently
* in progress. forcePageWrites is set to true when either of these is
* non-zero. lastBackupStart is the latest checkpoint redo location used
* as a starting point for an online backup.
*/
ExclusiveBackupState exclusiveBackupState;
int nonExclusiveBackups;
XLogRecPtr lastBackupStart;
/*
* WAL insertion locks.
*/
WALInsertLockPadded *WALInsertLocks;
} XLogCtlInsert;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Total shared-memory state for XLOG.
*/
typedef struct XLogCtlData
{
XLogCtlInsert Insert;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/* Protected by info_lck: */
XLogwrtRqst LogwrtRqst;
XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */
FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */
XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
/* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
XLogRecPtr unloggedLSN;
slock_t ulsn_lck;
/* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
pg_time_t lastSegSwitchTime;
XLogRecPtr lastSegSwitchLSN;
2001-03-22 05:01:46 +01:00
/*
* Protected by info_lck and WALWriteLock (you must hold either lock to
* read it, but both to update)
*/
XLogwrtResult LogwrtResult;
/*
* Latest initialized page in the cache (last byte position + 1).
*
* To change the identity of a buffer (and InitializedUpTo), you need to
* hold WALBufMappingLock. To change the identity of a buffer that's
* still dirty, the old page needs to be written out first, and for that
* you need WALWriteLock, and you need to ensure that there are no
* in-progress insertions to the page by calling
* WaitXLogInsertionsToFinish().
*/
XLogRecPtr InitializedUpTo;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* These values do not change after startup, although the pointed-to pages
* and xlblocks values certainly do. xlblocks values are protected by
* WALBufMappingLock.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
char *pages; /* buffers for unwritten XLOG pages */
XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
int XLogCacheBlck; /* highest allocated xlog buffer index */
/*
* InsertTimeLineID is the timeline into which new WAL is being inserted
* and flushed. It is zero during recovery, and does not change once set.
*
* If we create a new timeline when the system was started up,
* PrevTimeLineID is the old timeline's ID that we forked off from.
* Otherwise it's equal to InsertTimeLineID.
*/
TimeLineID InsertTimeLineID;
TimeLineID PrevTimeLineID;
2010-07-06 21:19:02 +02:00
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* SharedRecoveryState indicates if we're still in crash or archive
* recovery. Protected by info_lck.
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
RecoveryState SharedRecoveryState;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* InstallXLogFileSegmentActive indicates whether the checkpointer should
* arrange for future segments by recycling and/or PreallocXlogFiles().
* Protected by ControlFileLock. Only the startup process changes it. If
* true, anyone can use InstallXLogFileSegment(). If false, the startup
* process owns the exclusive right to install segments, by reading from
* the archive and possibly replacing existing files.
*/
bool InstallXLogFileSegmentActive;
/*
* WalWriterSleeping indicates whether the WAL writer is currently in
* low-power mode (and hence should be nudged if an async commit occurs).
* Protected by info_lck.
*/
bool WalWriterSleeping;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* During recovery, we keep a copy of the latest checkpoint record here.
* lastCheckPointRecPtr points to start of checkpoint record and
* lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
* checkpointer when it wants to create a restartpoint.
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*
* Protected by info_lck.
*/
XLogRecPtr lastCheckPointRecPtr;
XLogRecPtr lastCheckPointEndPtr;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
CheckPoint lastCheckPoint;
/*
* lastFpwDisableRecPtr points to the start of the last replayed
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
*/
XLogRecPtr lastFpwDisableRecPtr;
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
static XLogCtlData *XLogCtl = NULL;
/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
static WALInsertLockPadded *WALInsertLocks = NULL;
/*
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
* We maintain an image of pg_control in shared memory.
*/
static ControlFileData *ControlFile = NULL;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Calculate the amount of space left on the page after 'endptr'. Beware
* multiple evaluation!
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
#define INSERT_FREESPACE(endptr) \
(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
/* Macro to advance to next buffer index. */
#define NextBufIdx(idx) \
(((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
* would hold if it was in cache, the page containing 'recptr'.
*/
#define XLogRecPtrToBufIdx(recptr) \
(((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* These are the number of bytes in a WAL page usable for WAL data.
*/
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
/*
* Convert values of GUCs measured in megabytes to equiv. segment count.
* Rounds down.
*/
#define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize))
/* The number of bytes in a WAL segment usable for WAL data. */
static int UsableBytesInSegment;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Private, possibly out-of-date copy of shared LogwrtResult.
* See discussion above.
*/
static XLogwrtResult LogwrtResult = {0, 0};
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* openLogFile is -1 or a kernel FD for an open log file segment.
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
* openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
* These variables are only used to write the XLOG, and so will normally refer
* to the active segment.
*
Account explicitly for long-lived FDs that are allocated outside fd.c. The comments in fd.c have long claimed that all file allocations should go through that module, but in reality that's not always practical. fd.c doesn't supply APIs for invoking some FD-producing syscalls like pipe() or epoll_create(); and the APIs it does supply for non-virtual FDs are mostly insistent on releasing those FDs at transaction end; and in some cases the actual open() call is in code that can't be made to use fd.c, such as libpq. This has led to a situation where, in a modern server, there are likely to be seven or so long-lived FDs per backend process that are not known to fd.c. Since NUM_RESERVED_FDS is only 10, that meant we had *very* few spare FDs if max_files_per_process is >= the system ulimit and fd.c had opened all the files it thought it safely could. The contrib/postgres_fdw regression test, in particular, could easily be made to fall over by running it under a restrictive ulimit. To improve matters, invent functions Acquire/Reserve/ReleaseExternalFD that allow outside callers to tell fd.c that they have or want to allocate a FD that's not directly managed by fd.c. Add calls to track all the fixed FDs in a standard backend session, so that we are honestly guaranteeing that NUM_RESERVED_FDS FDs remain unused below the EMFILE limit in a backend's idle state. The coding rules for these functions say that there's no need to call them in code that just allocates one FD over a fairly short interval; we can dip into NUM_RESERVED_FDS for such cases. That means that there aren't all that many places where we need to worry. But postgres_fdw and dblink must use this facility to account for long-lived FDs consumed by libpq connections. There may be other places where it's worth doing such accounting, too, but this seems like enough to solve the immediate problem. Internally to fd.c, "external" FDs are limited to max_safe_fds/3 FDs. (Callers can choose to ignore this limit, but of course it's unwise to do so except for fixed file allocations.) I also reduced the limit on "allocated" files to max_safe_fds/3 FDs (it had been max_safe_fds/2). Conceivably a smarter rule could be used here --- but in practice, on reasonable systems, max_safe_fds should be large enough that this isn't much of an issue, so KISS for now. To avoid possible regression in the number of external or allocated files that can be opened, increase FD_MINFREE and the lower limit on max_files_per_process a little bit; we now insist that the effective "ulimit -n" be at least 64. This seems like pretty clearly a bug fix, but in view of the lack of field complaints, I'll refrain from risking a back-patch. Discussion: https://postgr.es/m/E1izCmM-0005pV-Co@gemulon.postgresql.org
2020-02-24 23:28:33 +01:00
* Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
static int openLogFile = -1;
static XLogSegNo openLogSegNo = 0;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
static TimeLineID openLogTLI = 0;
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
/*
* Local copies of equivalent fields in the control file. When running
* crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
* expect to replay all the WAL available, and updateMinRecoveryPoint is
* switched to false to prevent any updates while replaying records.
* Those values are kept consistent as long as crash recovery runs.
*/
static XLogRecPtr LocalMinRecoveryPoint;
static TimeLineID LocalMinRecoveryPointTLI;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
static bool updateMinRecoveryPoint = true;
/* For WALInsertLockAcquire/Release functions */
static int MyLockNo = 0;
static bool holdingAllLocks = false;
#ifdef WAL_DEBUG
static MemoryContext walDebugCxt = NULL;
#endif
static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogRecPtr EndOfLog,
TimeLineID newTLI);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static int LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
XLogRecPtr missingContrecPtr,
TimeLineID newTLI);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
bool opportunistic);
static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
bool find_free, XLogSegNo max_segno,
TimeLineID tli);
static void XLogFileClose(void);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
static void RemoveTempXlogFiles(void);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
XLogRecPtr endptr, TimeLineID insertTLI);
static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogSegNo *endlogSegNo, TimeLineID insertTLI);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static bool PerformRecoveryXLogAction(void);
static void InitControlFile(uint64 sysidentifier);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
static void WriteControlFile(void);
static void ReadControlFile(void);
static void UpdateControlFile(void);
static char *str_time(pg_time_t tnow);
static void pg_start_backup_callback(int code, Datum arg);
static void pg_stop_backup_callback(int code, Datum arg);
static int get_sync_bit(int method);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
XLogRecData *rdata,
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogRecPtr StartPos, XLogRecPtr EndPos,
TimeLineID tli);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Insert an XLOG record represented by an already-constructed chain of data
* chunks. This is a low-level routine; to construct the WAL record header
* and data, use the higher-level routines in xloginsert.c.
*
* If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
* WAL record applies to, that were not included in the record as full page
* images. If fpw_lsn <= RedoRecPtr, the function does not perform the
* insertion and returns InvalidXLogRecPtr. The caller can then recalculate
* which pages need a full-page image, and retry. If fpw_lsn is invalid, the
* record is always inserted.
*
* 'flags' gives more in-depth control on the record being inserted. See
* XLogSetRecordFlags() for details.
*
* 'topxid_included' tells whether the top-transaction id is logged along with
* current subtransaction. See XLogRecordAssemble().
*
* The first XLogRecData in the chain must be for the record header, and its
* data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
* xl_crc fields in the header, the rest of the header must already be filled
* by the caller.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*
* Returns XLOG pointer to end of record (beginning of next record).
* This can be used as LSN for data pages affected by the logged action.
* (LSN is the XLOG point up to which the XLOG must be flushed to disk
* before the data page can be written out. This implements the basic
* WAL rule "write the log before the data".)
*/
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata,
XLogRecPtr fpw_lsn,
uint8 flags,
int num_fpi,
bool topxid_included)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
pg_crc32c rdata_crc;
bool inserted;
XLogRecord *rechdr = (XLogRecord *) rdata->data;
uint8 info = rechdr->xl_info & ~XLR_INFO_MASK;
bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
info == XLOG_SWITCH);
XLogRecPtr StartPos;
XLogRecPtr EndPos;
bool prevDoPageWrites = doPageWrites;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
TimeLineID insertTLI;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/* we assume that all of the record header is in the first chunk */
Assert(rdata->len >= SizeOfXLogRecord);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/* cross-check on whether we should be here or not */
if (!XLogInsertAllowed())
elog(ERROR, "cannot make new WAL entries during recovery");
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
/*
* Given that we're not in recovery, InsertTimeLineID is set and can't
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
* change, so we can read it without a lock.
*/
insertTLI = XLogCtl->InsertTimeLineID;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
/*----------
*
* We have now done all the preparatory work we can without holding a
* lock or modifying shared state. From here on, inserting the new WAL
* record to the shared WAL buffer cache is a two-step process:
*
* 1. Reserve the right amount of space from the WAL. The current head of
* reserved space is kept in Insert->CurrBytePos, and is protected by
* insertpos_lck.
*
* 2. Copy the record to the reserved WAL space. This involves finding the
* correct WAL buffer containing the reserved space, and copying the
* record in place. This can be done concurrently in multiple processes.
*
* To keep track of which insertions are still in-progress, each concurrent
* inserter acquires an insertion lock. In addition to just indicating that
* an insertion is in progress, the lock tells others how far the inserter
* has progressed. There is a small fixed number of insertion locks,
* determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
* boundary, it updates the value stored in the lock to the how far it has
* inserted, to allow the previous buffer to be flushed.
*
* Holding onto an insertion lock also protects RedoRecPtr and
* fullPageWrites from changing until the insertion is finished.
*
* Step 2 can usually be done completely in parallel. If the required WAL
* page is not initialized yet, you have to grab WALBufMappingLock to
* initialize it, but the WAL writer tries to do that ahead of insertions
* to avoid that from happening in the critical path.
*
*----------
*/
START_CRIT_SECTION();
if (isLogSwitch)
WALInsertLockAcquireExclusive();
else
WALInsertLockAcquire();
/*
* Check to see if my copy of RedoRecPtr is out of date. If so, may have
* to go back and have the caller recompute everything. This can only
* happen just after a checkpoint, so it's better to be slow in this case
* and fast otherwise.
*
* Also check to see if fullPageWrites or forcePageWrites was just turned
* on; if we weren't already doing full-page writes then go back and
* recompute.
*
* If we aren't doing full-page writes then RedoRecPtr doesn't actually
* affect the contents of the XLOG record, so we'll update our local copy
* but not force a recomputation. (If doPageWrites was just turned off,
* we could recompute the record without full pages, but we choose not to
* bother.)
*/
if (RedoRecPtr != Insert->RedoRecPtr)
{
Assert(RedoRecPtr < Insert->RedoRecPtr);
RedoRecPtr = Insert->RedoRecPtr;
}
doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
if (doPageWrites &&
(!prevDoPageWrites ||
(fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
{
/*
* Oops, some buffer now needs to be backed up that the caller didn't
* back up. Start over.
*/
WALInsertLockRelease();
END_CRIT_SECTION();
return InvalidXLogRecPtr;
}
/*
* Reserve space for the record in the WAL. This also sets the xl_prev
* pointer.
*/
if (isLogSwitch)
inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
else
{
ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
&rechdr->xl_prev);
inserted = true;
}
if (inserted)
{
/*
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
* Now that xl_prev has been filled in, calculate CRC of the record
* header.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
rdata_crc = rechdr->xl_crc;
COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
FIN_CRC32C(rdata_crc);
rechdr->xl_crc = rdata_crc;
/*
* All the record data, including the header, is now ready to be
* inserted. Copy the record in the space reserved.
*/
CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
StartPos, EndPos, insertTLI);
/*
* Unless record is flagged as not important, update LSN of last
* important record in the current slot. When holding all locks, just
* update the first one.
*/
if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
{
int lockno = holdingAllLocks ? 0 : MyLockNo;
WALInsertLocks[lockno].l.lastImportantAt = StartPos;
}
}
else
{
/*
* This was an xlog-switch record, but the current insert location was
* already exactly at the beginning of a segment, so there was no need
* to do anything.
*/
}
/*
* Done! Let others know that we're finished.
*/
WALInsertLockRelease();
END_CRIT_SECTION();
MarkCurrentTransactionIdLoggedIfAny();
/*
* Mark top transaction id is logged (if needed) so that we should not try
* to log it again with the next WAL record in the current subtransaction.
*/
if (topxid_included)
MarkSubxactTopXidLogged();
/*
* Update shared LogwrtRqst.Write, if we crossed page boundary.
*/
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
{
SpinLockAcquire(&XLogCtl->info_lck);
/* advance global request to include new block(s) */
if (XLogCtl->LogwrtRqst.Write < EndPos)
XLogCtl->LogwrtRqst.Write = EndPos;
/* update local result copy while I have the chance */
LogwrtResult = XLogCtl->LogwrtResult;
SpinLockRelease(&XLogCtl->info_lck);
}
/*
* If this was an XLOG_SWITCH record, flush the record and the empty
* padding space that fills the rest of the segment, and perform
* end-of-segment actions (eg, notifying archiver).
*/
if (isLogSwitch)
{
TRACE_POSTGRESQL_WAL_SWITCH();
XLogFlush(EndPos);
/*
* Even though we reserved the rest of the segment for us, which is
* reflected in EndPos, we return a pointer to just the end of the
* xlog-switch record.
*/
if (inserted)
{
EndPos = StartPos + SizeOfXLogRecord;
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
{
uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size);
if (offset == EndPos % XLOG_BLCKSZ)
EndPos += SizeOfXLogLongPHD;
else
EndPos += SizeOfXLogShortPHD;
}
}
}
#ifdef WAL_DEBUG
if (XLOG_DEBUG)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
static XLogReaderState *debug_reader = NULL;
XLogRecord *record;
DecodedXLogRecord *decoded;
StringInfoData buf;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
StringInfoData recordBuf;
char *errormsg = NULL;
MemoryContext oldCxt;
oldCxt = MemoryContextSwitchTo(walDebugCxt);
initStringInfo(&buf);
appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/*
* We have to piece together the WAL record data from the XLogRecData
* entries, so that we can pass it to the rm_desc function as one
* contiguous chunk.
*/
initStringInfo(&recordBuf);
for (; rdata != NULL; rdata = rdata->next)
appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
/* We also need temporary space to decode the record. */
record = (XLogRecord *) recordBuf.data;
decoded = (DecodedXLogRecord *)
palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (!debug_reader)
debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
XL_ROUTINE(), NULL);
if (!debug_reader)
{
appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
}
else if (!DecodeXLogRecord(debug_reader,
decoded,
record,
EndPos,
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
&errormsg))
{
appendStringInfo(&buf, "error decoding record: %s",
errormsg ? errormsg : "no error message");
}
else
{
appendStringInfoString(&buf, " - ");
debug_reader->record = decoded;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlog_outdesc(&buf, debug_reader);
debug_reader->record = NULL;
}
elog(LOG, "%s", buf.data);
pfree(decoded);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
pfree(buf.data);
pfree(recordBuf.data);
MemoryContextSwitchTo(oldCxt);
}
#endif
/*
* Update our global variables
*/
ProcLastRecPtr = StartPos;
XactLastRecEnd = EndPos;
/* Report WAL traffic to the instrumentation. */
if (inserted)
{
pgWalUsage.wal_bytes += rechdr->xl_tot_len;
pgWalUsage.wal_records++;
pgWalUsage.wal_fpi += num_fpi;
}
return EndPos;
}
/*
* Reserves the right amount of space for a record of given size from the WAL.
* *StartPos is set to the beginning of the reserved section, *EndPos to
* its end+1. *PrevPtr is set to the beginning of the previous record; it is
* used to set the xl_prev of this record.
*
* This is the performance critical part of XLogInsert that must be serialized
* across backends. The rest can happen mostly in parallel. Try to keep this
* section as short as possible, insertpos_lck can be heavily contended on a
* busy system.
*
* NB: The space calculation here must match the code in CopyXLogRecordToWAL,
* where we actually copy the record to the reserved space.
*/
static void
ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
XLogRecPtr *PrevPtr)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
uint64 startbytepos;
uint64 endbytepos;
uint64 prevbytepos;
size = MAXALIGN(size);
/* All (non xlog-switch) records should contain data. */
Assert(size > SizeOfXLogRecord);
/*
* The duration the spinlock needs to be held is minimized by minimizing
* the calculations that have to be done while holding the lock. The
* current tip of reserved WAL is kept in CurrBytePos, as a byte position
* that only counts "usable" bytes in WAL, that is, it excludes all WAL
* page headers. The mapping between "usable" byte positions and physical
* positions (XLogRecPtrs) can be done outside the locked region, and
* because the usable byte position doesn't include any headers, reserving
* X bytes from WAL is almost as simple as "CurrBytePos += X".
*/
SpinLockAcquire(&Insert->insertpos_lck);
startbytepos = Insert->CurrBytePos;
endbytepos = startbytepos + size;
prevbytepos = Insert->PrevBytePos;
Insert->CurrBytePos = endbytepos;
Insert->PrevBytePos = startbytepos;
SpinLockRelease(&Insert->insertpos_lck);
*StartPos = XLogBytePosToRecPtr(startbytepos);
*EndPos = XLogBytePosToEndRecPtr(endbytepos);
*PrevPtr = XLogBytePosToRecPtr(prevbytepos);
/*
* Check that the conversions between "usable byte positions" and
* XLogRecPtrs work consistently in both directions.
*/
Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
}
/*
* Like ReserveXLogInsertLocation(), but for an xlog-switch record.
*
* A log-switch record is handled slightly differently. The rest of the
* segment will be reserved for this insertion, as indicated by the returned
* *EndPos value. However, if we are already at the beginning of the current
* segment, *StartPos and *EndPos are set to the current location without
* reserving any space, and the function returns false.
*/
static bool
ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
uint64 startbytepos;
uint64 endbytepos;
uint64 prevbytepos;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
uint32 size = MAXALIGN(SizeOfXLogRecord);
XLogRecPtr ptr;
uint32 segleft;
/*
* These calculations are a bit heavy-weight to be done while holding a
* spinlock, but since we're holding all the WAL insertion locks, there
* are no other inserters competing for it. GetXLogInsertRecPtr() does
* compete for it, but that's not called very frequently.
*/
SpinLockAcquire(&Insert->insertpos_lck);
startbytepos = Insert->CurrBytePos;
ptr = XLogBytePosToEndRecPtr(startbytepos);
if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
{
SpinLockRelease(&Insert->insertpos_lck);
*EndPos = *StartPos = ptr;
return false;
}
endbytepos = startbytepos + size;
prevbytepos = Insert->PrevBytePos;
*StartPos = XLogBytePosToRecPtr(startbytepos);
*EndPos = XLogBytePosToEndRecPtr(endbytepos);
segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
if (segleft != wal_segment_size)
{
/* consume the rest of the segment */
*EndPos += segleft;
endbytepos = XLogRecPtrToBytePos(*EndPos);
}
Insert->CurrBytePos = endbytepos;
Insert->PrevBytePos = startbytepos;
SpinLockRelease(&Insert->insertpos_lck);
*PrevPtr = XLogBytePosToRecPtr(prevbytepos);
Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
return true;
}
/*
* Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
* area in the WAL.
*/
static void
CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
{
char *currpos;
int freespace;
int written;
XLogRecPtr CurrPos;
XLogPageHeader pagehdr;
/*
* Get a pointer to the right place in the right WAL buffer to start
* inserting to.
*/
CurrPos = StartPos;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
currpos = GetXLogBuffer(CurrPos, tli);
freespace = INSERT_FREESPACE(CurrPos);
/*
* there should be enough space for at least the first field (xl_tot_len)
* on this page.
*/
Assert(freespace >= sizeof(uint32));
/* Copy record data */
written = 0;
while (rdata != NULL)
{
char *rdata_data = rdata->data;
int rdata_len = rdata->len;
while (rdata_len > freespace)
{
/*
* Write what fits on this page, and continue on the next page.
*/
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
memcpy(currpos, rdata_data, freespace);
rdata_data += freespace;
rdata_len -= freespace;
written += freespace;
CurrPos += freespace;
/*
* Get pointer to beginning of next page, and set the xlp_rem_len
* in the page header. Set XLP_FIRST_IS_CONTRECORD.
*
* It's safe to set the contrecord flag and xlp_rem_len without a
* lock on the page. All the other flags were already set when the
* page was initialized, in AdvanceXLInsertBuffer, and we're the
* only backend that needs to set the contrecord flag.
*/
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
currpos = GetXLogBuffer(CurrPos, tli);
pagehdr = (XLogPageHeader) currpos;
pagehdr->xlp_rem_len = write_len - written;
pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
/* skip over the page header */
if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
{
CurrPos += SizeOfXLogLongPHD;
currpos += SizeOfXLogLongPHD;
}
else
{
CurrPos += SizeOfXLogShortPHD;
currpos += SizeOfXLogShortPHD;
}
freespace = INSERT_FREESPACE(CurrPos);
}
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
memcpy(currpos, rdata_data, rdata_len);
currpos += rdata_len;
CurrPos += rdata_len;
freespace -= rdata_len;
written += rdata_len;
rdata = rdata->next;
}
Assert(written == write_len);
/*
* If this was an xlog-switch, it's not enough to write the switch record,
* we also have to consume all the remaining space in the WAL segment. We
* have already reserved that space, but we need to actually fill it.
*/
if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
{
/* An xlog-switch record doesn't contain any data besides the header */
Assert(write_len == SizeOfXLogRecord);
/* Assert that we did reserve the right amount of space */
Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
/* Use up all the remaining space on the current page */
CurrPos += freespace;
/*
* Cause all remaining pages in the segment to be flushed, leaving the
* XLog position where it should be, at the start of the next segment.
* We do this one page at a time, to make sure we don't deadlock
* against ourselves if wal_buffers < wal_segment_size.
*/
while (CurrPos < EndPos)
{
/*
* The minimal action to flush the page would be to call
* WALInsertLockUpdateInsertingAt(CurrPos) followed by
* AdvanceXLInsertBuffer(...). The page would be left initialized
* mostly to zeros, except for the page header (always the short
* variant, as this is never a segment's first page).
*
* The large vistas of zeros are good for compressibility, but the
* headers interrupting them every XLOG_BLCKSZ (with values that
* differ from page to page) are not. The effect varies with
* compression tool, but bzip2 for instance compresses about an
* order of magnitude worse if those headers are left in place.
*
* Rather than complicating AdvanceXLInsertBuffer itself (which is
* called in heavily-loaded circumstances as well as this lightly-
* loaded one) with variant behavior, we just use GetXLogBuffer
* (which itself calls the two methods we need) to get the pointer
* and zero most of the page. Then we just zero the page header.
*/
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
currpos = GetXLogBuffer(CurrPos, tli);
MemSet(currpos, 0, SizeOfXLogShortPHD);
CurrPos += XLOG_BLCKSZ;
}
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
else
{
/* Align the end position, so that the next record starts aligned */
CurrPos = MAXALIGN64(CurrPos);
}
if (CurrPos != EndPos)
elog(PANIC, "space reserved for WAL record does not match what was written");
}
/*
* Acquire a WAL insertion lock, for inserting to WAL.
*/
static void
WALInsertLockAcquire(void)
{
bool immed;
/*
* It doesn't matter which of the WAL insertion locks we acquire, so try
* the one we used last time. If the system isn't particularly busy, it's
* a good bet that it's still available, and it's good to have some
* affinity to a particular lock so that you don't unnecessarily bounce
* cache lines between processes when there's no contention.
*
* If this is the first time through in this backend, pick a lock
* (semi-)randomly. This allows the locks to be used evenly if you have a
* lot of very short connections.
*/
static int lockToTry = -1;
if (lockToTry == -1)
lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
MyLockNo = lockToTry;
/*
* The insertingAt value is initially set to 0, as we don't know our
* insert location yet.
*/
immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
if (!immed)
{
/*
* If we couldn't get the lock immediately, try another lock next
* time. On a system with more insertion locks than concurrent
* inserters, this causes all the inserters to eventually migrate to a
* lock that no-one else is using. On a system with more inserters
* than locks, it still helps to distribute the inserters evenly
* across the locks.
*/
lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
}
}
/*
* Acquire all WAL insertion locks, to prevent other backends from inserting
* to WAL.
*/
static void
WALInsertLockAcquireExclusive(void)
{
int i;
/*
* When holding all the locks, all but the last lock's insertingAt
* indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
* XLogRecPtr value, to make sure that no-one blocks waiting on those.
*/
for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
{
LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
LWLockUpdateVar(&WALInsertLocks[i].l.lock,
&WALInsertLocks[i].l.insertingAt,
PG_UINT64_MAX);
}
/* Variable value reset to 0 at release */
LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
holdingAllLocks = true;
}
/*
* Release our insertion lock (or locks, if we're holding them all).
*
* NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
* next time the lock is acquired.
*/
static void
WALInsertLockRelease(void)
{
if (holdingAllLocks)
{
int i;
for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
&WALInsertLocks[i].l.insertingAt,
0);
holdingAllLocks = false;
}
else
{
LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
&WALInsertLocks[MyLockNo].l.insertingAt,
0);
}
}
/*
* Update our insertingAt value, to let others know that we've finished
* inserting up to that point.
*/
static void
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
{
if (holdingAllLocks)
{
/*
* We use the last lock to mark our actual position, see comments in
* WALInsertLockAcquireExclusive.
*/
LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
insertingAt);
}
else
LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
&WALInsertLocks[MyLockNo].l.insertingAt,
insertingAt);
}
/*
* Wait for any WAL insertions < upto to finish.
*
* Returns the location of the oldest insertion that is still in-progress.
* Any WAL prior to that point has been fully copied into WAL buffers, and
* can be flushed out to disk. Because this waits for any insertions older
* than 'upto' to finish, the return value is always >= 'upto'.
*
* Note: When you are about to write out WAL, you must call this function
* *before* acquiring WALWriteLock, to avoid deadlocks. This function might
* need to wait for an insertion to finish (or at least advance to next
* uninitialized page), and the inserter might need to evict an old WAL buffer
* to make room for a new one, which in turn requires WALWriteLock.
*/
static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)
{
uint64 bytepos;
XLogRecPtr reservedUpto;
XLogRecPtr finishedUpto;
XLogCtlInsert *Insert = &XLogCtl->Insert;
int i;
if (MyProc == NULL)
elog(PANIC, "cannot wait without a PGPROC structure");
/* Read the current insert position */
SpinLockAcquire(&Insert->insertpos_lck);
bytepos = Insert->CurrBytePos;
SpinLockRelease(&Insert->insertpos_lck);
reservedUpto = XLogBytePosToEndRecPtr(bytepos);
/*
* No-one should request to flush a piece of WAL that hasn't even been
* reserved yet. However, it can happen if there is a block with a bogus
* LSN on disk, for example. XLogFlush checks for that situation and
* complains, but only after the flush. Here we just assume that to mean
* that all WAL that has been reserved needs to be finished. In this
* corner-case, the return value can be smaller than 'upto' argument.
*/
if (upto > reservedUpto)
{
ereport(LOG,
(errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
upto = reservedUpto;
}
/*
* Loop through all the locks, sleeping on any in-progress insert older
* than 'upto'.
*
* finishedUpto is our return value, indicating the point upto which all
* the WAL insertions have been finished. Initialize it to the head of
* reserved WAL, and as we iterate through the insertion locks, back it
* out for any insertion that's still in progress.
*/
finishedUpto = reservedUpto;
for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
{
XLogRecPtr insertingat = InvalidXLogRecPtr;
do
{
/*
* See if this insertion is in progress. LWLockWaitForVar will
* wait for the lock to be released, or for the 'value' to be set
* by a LWLockUpdateVar call. When a lock is initially acquired,
* its value is 0 (InvalidXLogRecPtr), which means that we don't
* know where it's inserting yet. We will have to wait for it. If
* it's a small insertion, the record will most likely fit on the
* same page and the inserter will release the lock without ever
* calling LWLockUpdateVar. But if it has to sleep, it will
* advertise the insertion point with LWLockUpdateVar before
* sleeping.
*/
if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
&WALInsertLocks[i].l.insertingAt,
insertingat, &insertingat))
{
/* the lock was free, so no insertion in progress */
insertingat = InvalidXLogRecPtr;
break;
}
/*
* This insertion is still in progress. Have to wait, unless the
* inserter has proceeded past 'upto'.
*/
} while (insertingat < upto);
if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
finishedUpto = insertingat;
}
return finishedUpto;
}
/*
* Get a pointer to the right location in the WAL buffer containing the
* given XLogRecPtr.
*
* If the page is not initialized yet, it is initialized. That might require
* evicting an old dirty buffer from the buffer cache, which means I/O.
*
* The caller must ensure that the page containing the requested location
* isn't evicted yet, and won't be evicted. The way to ensure that is to
* hold onto a WAL insertion lock with the insertingAt position set to
* something <= ptr. GetXLogBuffer() will update insertingAt if it needs
* to evict an old page from the buffer. (This means that once you call
* GetXLogBuffer() with a given 'ptr', you must not access anything before
* that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
* later, because older buffers might be recycled already)
*/
static char *
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
{
int idx;
XLogRecPtr endptr;
static uint64 cachedPage = 0;
static char *cachedPos = NULL;
XLogRecPtr expectedEndPtr;
/*
* Fast path for the common case that we need to access again the same
* page as last time.
*/
if (ptr / XLOG_BLCKSZ == cachedPage)
{
Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
return cachedPos + ptr % XLOG_BLCKSZ;
}
/*
* The XLog buffer cache is organized so that a page is always loaded to a
* particular buffer. That way we can easily calculate the buffer a given
* page must be loaded into, from the XLogRecPtr alone.
*/
idx = XLogRecPtrToBufIdx(ptr);
/*
* See what page is loaded in the buffer at the moment. It could be the
* page we're looking for, or something older. It can't be anything newer
* - that would imply the page we're looking for has already been written
* out to disk and evicted, and the caller is responsible for making sure
* that doesn't happen.
*
* However, we don't hold a lock while we read the value. If someone has
* just initialized the page, it's possible that we get a "torn read" of
* the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
* that case we will see a bogus value. That's ok, we'll grab the mapping
* lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
* the page we're looking for. But it means that when we do this unlocked
* read, we might see a value that appears to be ahead of the page we're
* looking for. Don't PANIC on that, until we've verified the value while
* holding the lock.
*/
expectedEndPtr = ptr;
expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
endptr = XLogCtl->xlblocks[idx];
if (expectedEndPtr != endptr)
{
XLogRecPtr initializedUpto;
/*
* Before calling AdvanceXLInsertBuffer(), which can block, let others
* know how far we're finished with inserting the record.
*
* NB: If 'ptr' points to just after the page header, advertise a
* position at the beginning of the page rather than 'ptr' itself. If
* there are no other insertions running, someone might try to flush
* up to our advertised location. If we advertised a position after
* the page header, someone might try to flush the page header, even
* though page might actually not be initialized yet. As the first
* inserter on the page, we are effectively responsible for making
* sure that it's initialized, before we let insertingAt to move past
* the page header.
*/
if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
initializedUpto = ptr - SizeOfXLogShortPHD;
else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
initializedUpto = ptr - SizeOfXLogLongPHD;
else
initializedUpto = ptr;
WALInsertLockUpdateInsertingAt(initializedUpto);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
AdvanceXLInsertBuffer(ptr, tli, false);
endptr = XLogCtl->xlblocks[idx];
if (expectedEndPtr != endptr)
elog(PANIC, "could not find WAL buffer for %X/%X",
LSN_FORMAT_ARGS(ptr));
}
else
{
/*
* Make sure the initialization of the page is visible to us, and
* won't arrive later to overwrite the WAL data we write on the page.
*/
pg_memory_barrier();
}
/*
* Found the buffer holding this page. Return a pointer to the right
* offset within the page.
*/
cachedPage = ptr / XLOG_BLCKSZ;
cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
return cachedPos + ptr % XLOG_BLCKSZ;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Converts a "usable byte position" to XLogRecPtr. A usable byte position
* is the position starting from the beginning of WAL, excluding all WAL
* page headers.
*/
static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)
{
uint64 fullsegs;
uint64 fullpages;
uint64 bytesleft;
uint32 seg_offset;
XLogRecPtr result;
fullsegs = bytepos / UsableBytesInSegment;
bytesleft = bytepos % UsableBytesInSegment;
if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2000-10-21 17:43:36 +02:00
{
/* fits on first page of segment */
seg_offset = bytesleft + SizeOfXLogLongPHD;
2000-10-21 17:43:36 +02:00
}
else
{
/* account for the first page on segment with long header */
seg_offset = XLOG_BLCKSZ;
bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
fullpages = bytesleft / UsableBytesInPage;
bytesleft = bytesleft % UsableBytesInPage;
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
}
XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
return result;
}
2000-06-02 12:20:27 +02:00
/*
* Like XLogBytePosToRecPtr, but if the position is at a page boundary,
* returns a pointer to the beginning of the page (ie. before page header),
* not to where the first xlog record on that page would go to. This is used
* when converting a pointer to the end of a record.
*/
static XLogRecPtr
XLogBytePosToEndRecPtr(uint64 bytepos)
{
uint64 fullsegs;
uint64 fullpages;
uint64 bytesleft;
uint32 seg_offset;
XLogRecPtr result;
fullsegs = bytepos / UsableBytesInSegment;
bytesleft = bytepos % UsableBytesInSegment;
if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
{
/* fits on first page of segment */
if (bytesleft == 0)
seg_offset = 0;
else
seg_offset = bytesleft + SizeOfXLogLongPHD;
}
else
{
/* account for the first page on segment with long header */
seg_offset = XLOG_BLCKSZ;
bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
fullpages = bytesleft / UsableBytesInPage;
bytesleft = bytesleft % UsableBytesInPage;
if (bytesleft == 0)
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
else
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
}
XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
return result;
}
/*
* Convert an XLogRecPtr to a "usable byte position".
*/
static uint64
XLogRecPtrToBytePos(XLogRecPtr ptr)
{
uint64 fullsegs;
uint32 fullpages;
uint32 offset;
uint64 result;
XLByteToSeg(ptr, fullsegs, wal_segment_size);
fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
offset = ptr % XLOG_BLCKSZ;
if (fullpages == 0)
{
result = fullsegs * UsableBytesInSegment;
if (offset > 0)
{
Assert(offset >= SizeOfXLogLongPHD);
result += offset - SizeOfXLogLongPHD;
}
}
else
{
result = fullsegs * UsableBytesInSegment +
(XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
(fullpages - 1) * UsableBytesInPage; /* full pages */
if (offset > 0)
{
Assert(offset >= SizeOfXLogShortPHD);
result += offset - SizeOfXLogShortPHD;
}
}
return result;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Initialize XLOG buffers, writing out old buffers if they still contain
* unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
* true, initialize as many pages as we can without having to write out
* unwritten data. Any new pages are initialized to zeros, with pages headers
* initialized properly.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
static void
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
{
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
XLogCtlInsert *Insert = &XLogCtl->Insert;
int nextidx;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
XLogRecPtr OldPageRqstPtr;
XLogwrtRqst WriteRqst;
XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
XLogRecPtr NewPageBeginPtr;
XLogPageHeader NewPage;
int npages = 0;
LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Now that we have the lock, check if someone initialized the page
* already.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
while (upto >= XLogCtl->InitializedUpTo || opportunistic)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
{
nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
/*
* Get ending-offset of the buffer page we need to replace (this may
* be zero if the buffer hasn't been used yet). Fall through if it's
* already written out.
*/
OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
if (LogwrtResult.Write < OldPageRqstPtr)
{
/*
* Nope, got work to do. If we just want to pre-initialize as much
* as we can without flushing, give up now.
*/
if (opportunistic)
break;
/* Before waiting, get info_lck and update LogwrtResult */
SpinLockAcquire(&XLogCtl->info_lck);
if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
LogwrtResult = XLogCtl->LogwrtResult;
SpinLockRelease(&XLogCtl->info_lck);
/*
* Now that we have an up-to-date LogwrtResult value, see if we
* still need to write it or if someone else already did.
*/
if (LogwrtResult.Write < OldPageRqstPtr)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
{
/*
* Must acquire write lock. Release WALBufMappingLock first,
* to make sure that all insertions that we need to wait for
* can finish (up to this same position). Otherwise we risk
* deadlock.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
LWLockRelease(WALBufMappingLock);
WaitXLogInsertionsToFinish(OldPageRqstPtr);
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
LogwrtResult = XLogCtl->LogwrtResult;
if (LogwrtResult.Write >= OldPageRqstPtr)
{
/* OK, someone wrote it already */
LWLockRelease(WALWriteLock);
}
else
{
/* Have to write it ourselves */
TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
WriteRqst.Write = OldPageRqstPtr;
WriteRqst.Flush = 0;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogWrite(WriteRqst, tli, false);
LWLockRelease(WALWriteLock);
WalStats.m_wal_buffers_full++;
TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
}
/* Re-acquire WALBufMappingLock and retry */
LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
continue;
}
}
/*
* Now the next buffer slot is free and we can set it up to be the
* next output page.
*/
NewPageBeginPtr = XLogCtl->InitializedUpTo;
NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2005-10-15 04:49:52 +02:00
/*
* Be sure to re-zero the buffer so that bytes beyond what we've
* written will look like zeroes and not valid XLOG records...
*/
MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2005-10-15 04:49:52 +02:00
/*
* Fill the new page's header
*/
NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2001-03-22 05:01:46 +01:00
/* NewPage->xlp_info = 0; */ /* done by memset */
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
NewPage->xlp_tli = tli;
NewPage->xlp_pageaddr = NewPageBeginPtr;
/* NewPage->xlp_rem_len = 0; */ /* done by memset */
/*
* If online backup is not in progress, mark the header to indicate
* that WAL records beginning in this page have removable backup
* blocks. This allows the WAL archiver to know whether it is safe to
* compress archived WAL data by transforming full-block records into
* the non-full-block format. It is sufficient to record this at the
* page level because we force a page switch (in fact a segment
* switch) when starting a backup, so the flag will be off before any
* records can be written during the backup. At the end of a backup,
* the last page will be marked as all unsafe when perhaps only part
* is unsafe, but at worst the archiver would miss the opportunity to
* compress a few records.
*/
if (!Insert->forcePageWrites)
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2005-10-15 04:49:52 +02:00
/*
* If first page of an XLOG segment file, make it a long header.
*/
if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
{
XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
NewLongPage->xlp_sysid = ControlFile->system_identifier;
NewLongPage->xlp_seg_size = wal_segment_size;
NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
NewPage->xlp_info |= XLP_LONG_HEADER;
}
/*
* Make sure the initialization of the page becomes visible to others
* before the xlblocks update. GetXLogBuffer() reads xlblocks without
* holding a lock.
*/
pg_write_barrier();
*((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
XLogCtl->InitializedUpTo = NewPageEndPtr;
2005-10-15 04:49:52 +02:00
npages++;
}
LWLockRelease(WALBufMappingLock);
#ifdef WAL_DEBUG
if (XLOG_DEBUG && npages > 0)
{
elog(DEBUG1, "initialized %d pages, up to %X/%X",
npages, LSN_FORMAT_ARGS(NewPageEndPtr));
}
#endif
}
/*
* Calculate CheckPointSegments based on max_wal_size_mb and
* checkpoint_completion_target.
*/
static void
CalculateCheckpointSegments(void)
{
double target;
/*-------
* Calculate the distance at which to trigger a checkpoint, to avoid
* exceeding max_wal_size_mb. This is based on two assumptions:
*
* a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
* WAL for two checkpoint cycles to allow us to recover from the
* secondary checkpoint if the first checkpoint failed, though we
* only did this on the primary anyway, not on standby. Keeping just
* one checkpoint simplifies processing and reduces disk space in
* many smaller databases.)
* b) during checkpoint, we consume checkpoint_completion_target *
* number of segments consumed between checkpoints.
*-------
*/
target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
(1.0 + CheckPointCompletionTarget);
/* round down */
CheckPointSegments = (int) target;
if (CheckPointSegments < 1)
CheckPointSegments = 1;
}
void
assign_max_wal_size(int newval, void *extra)
{
max_wal_size_mb = newval;
CalculateCheckpointSegments();
}
void
assign_checkpoint_completion_target(double newval, void *extra)
{
CheckPointCompletionTarget = newval;
CalculateCheckpointSegments();
}
/*
* At a checkpoint, how many WAL segments to recycle as preallocated future
* XLOG segments? Returns the highest segment that should be preallocated.
*/
static XLogSegNo
XLOGfileslop(XLogRecPtr lastredoptr)
{
XLogSegNo minSegNo;
XLogSegNo maxSegNo;
double distance;
XLogSegNo recycleSegNo;
/*
* Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
* correspond to. Always recycle enough segments to meet the minimum, and
* remove enough segments to stay below the maximum.
*/
minSegNo = lastredoptr / wal_segment_size +
ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
maxSegNo = lastredoptr / wal_segment_size +
ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
/*
* Between those limits, recycle enough segments to get us through to the
* estimated end of next checkpoint.
*
* To estimate where the next checkpoint will finish, assume that the
* system runs steadily consuming CheckPointDistanceEstimate bytes between
* every checkpoint.
*/
distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
/* add 10% for good measure. */
distance *= 1.10;
recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
wal_segment_size);
if (recycleSegNo < minSegNo)
recycleSegNo = minSegNo;
if (recycleSegNo > maxSegNo)
recycleSegNo = maxSegNo;
return recycleSegNo;
}
/*
* Check whether we've consumed enough xlog space that a checkpoint is needed.
*
* new_segno indicates a log file that has just been filled up (or read
* during recovery). We measure the distance from RedoRecPtr to new_segno
* and see if that exceeds CheckPointSegments.
*
* Note: it is caller's responsibility that RedoRecPtr is up-to-date.
*/
bool
XLogCheckpointNeeded(XLogSegNo new_segno)
{
XLogSegNo old_segno;
XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
return true;
return false;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Write and/or fsync the log at least as far as WriteRqst indicates.
*
* If flexible == true, we don't have to write as far as WriteRqst, but
* may stop at any convenient boundary (such as a cache or logfile boundary).
* This option allows us to avoid uselessly issuing multiple writes when a
* single one would do.
*
* Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
* must be called before grabbing the lock, to make sure the data is ready to
* write.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
static void
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
{
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
bool ispartialpage;
bool last_iteration;
bool finishing_seg;
int curridx;
int npages;
int startidx;
uint32 startoffset;
/* We should always be inside a critical section here */
Assert(CritSectionCount > 0);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Update local LogwrtResult (caller probably did this already, but...)
*/
LogwrtResult = XLogCtl->LogwrtResult;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Since successive pages in the xlog cache are consecutively allocated,
* we can usually gather multiple pages together and issue just one
* write() call. npages is the number of pages we have determined can be
* written together; startidx is the cache block index of the first one,
* and startoffset is the file offset at which it should go. The latter
* two variables are only valid when npages > 0, but we must initialize
* all of them to keep the compiler quiet.
*/
npages = 0;
startidx = 0;
startoffset = 0;
/*
* Within the loop, curridx is the cache block index of the page to
* consider writing. Begin at the buffer containing the next unwritten
* page, or last partially written page.
*/
curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
Use O_DIRECT if available when using O_SYNC for wal_sync_method. Also, write multiple WAL buffers out in one write() operation. ITAGAKI Takahiro --------------------------------------------------------------------------- > If we disable writeback-cache and use open_sync, the per-page writing > behavior in WAL module will show up as bad result. O_DIRECT is similar > to O_DSYNC (at least on linux), so that the benefit of it will disappear > behind the slow disk revolution. > > In the current source, WAL is written as: > for (i = 0; i < N; i++) { write(&buffers[i], BLCKSZ); } > Is this intentional? Can we rewrite it as follows? > write(&buffers[0], N * BLCKSZ); > > In order to achieve it, I wrote a 'gather-write' patch (xlog.gw.diff). > Aside from this, I'll also send the fixed direct io patch (xlog.dio.diff). > These two patches are independent, so they can be applied either or both. > > > I tested them on my machine and the results as follows. It shows that > direct-io and gather-write is the best choice when writeback-cache is off. > Are these two patches worth trying if they are used together? > > > | writeback | fsync= | fdata | open_ | fsync_ | open_ > patch | cache | false | sync | sync | direct | direct > ------------+-----------+--------+-------+-------+--------+--------- > direct io | off | 124.2 | 105.7 | 48.3 | 48.3 | 48.2 > direct io | on | 129.1 | 112.3 | 114.1 | 142.9 | 144.5 > gather-write| off | 124.3 | 108.7 | 105.4 | (N/A) | (N/A) > both | off | 131.5 | 115.5 | 114.4 | 145.4 | 145.2 > > - 20runs * pgbench -s 100 -c 50 -t 200 > - with tuning (wal_buffers=64, commit_delay=500, checkpoint_segments=8) > - using 2 ATA disks: > - hda(reiserfs) includes system and wal. > - hdc(jfs) includes database files. writeback-cache is always on. > > --- > ITAGAKI Takahiro
2005-07-29 05:22:33 +02:00
while (LogwrtResult.Write < WriteRqst.Write)
{
/*
* Make sure we're not ahead of the insert process. This could happen
* if we're passed a bogus WriteRqst.Write that is past the end of the
* last page that's been initialized by AdvanceXLInsertBuffer.
*/
XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
if (LogwrtResult.Write >= EndPtr)
elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
LSN_FORMAT_ARGS(LogwrtResult.Write),
LSN_FORMAT_ARGS(EndPtr));
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/* Advance LogwrtResult.Write to end of current buffer page */
LogwrtResult.Write = EndPtr;
ispartialpage = WriteRqst.Write < LogwrtResult.Write;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size))
{
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Switch to new logfile segment. We cannot have any pending
* pages here (since we dump what we have at segment end).
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
Assert(npages == 0);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
if (openLogFile >= 0)
XLogFileClose();
XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
openLogTLI = tli;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/* create/use new log file */
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
openLogFile = XLogFileInit(openLogSegNo, tli);
Account explicitly for long-lived FDs that are allocated outside fd.c. The comments in fd.c have long claimed that all file allocations should go through that module, but in reality that's not always practical. fd.c doesn't supply APIs for invoking some FD-producing syscalls like pipe() or epoll_create(); and the APIs it does supply for non-virtual FDs are mostly insistent on releasing those FDs at transaction end; and in some cases the actual open() call is in code that can't be made to use fd.c, such as libpq. This has led to a situation where, in a modern server, there are likely to be seven or so long-lived FDs per backend process that are not known to fd.c. Since NUM_RESERVED_FDS is only 10, that meant we had *very* few spare FDs if max_files_per_process is >= the system ulimit and fd.c had opened all the files it thought it safely could. The contrib/postgres_fdw regression test, in particular, could easily be made to fall over by running it under a restrictive ulimit. To improve matters, invent functions Acquire/Reserve/ReleaseExternalFD that allow outside callers to tell fd.c that they have or want to allocate a FD that's not directly managed by fd.c. Add calls to track all the fixed FDs in a standard backend session, so that we are honestly guaranteeing that NUM_RESERVED_FDS FDs remain unused below the EMFILE limit in a backend's idle state. The coding rules for these functions say that there's no need to call them in code that just allocates one FD over a fairly short interval; we can dip into NUM_RESERVED_FDS for such cases. That means that there aren't all that many places where we need to worry. But postgres_fdw and dblink must use this facility to account for long-lived FDs consumed by libpq connections. There may be other places where it's worth doing such accounting, too, but this seems like enough to solve the immediate problem. Internally to fd.c, "external" FDs are limited to max_safe_fds/3 FDs. (Callers can choose to ignore this limit, but of course it's unwise to do so except for fixed file allocations.) I also reduced the limit on "allocated" files to max_safe_fds/3 FDs (it had been max_safe_fds/2). Conceivably a smarter rule could be used here --- but in practice, on reasonable systems, max_safe_fds should be large enough that this isn't much of an issue, so KISS for now. To avoid possible regression in the number of external or allocated files that can be opened, increase FD_MINFREE and the lower limit on max_files_per_process a little bit; we now insist that the effective "ulimit -n" be at least 64. This seems like pretty clearly a bug fix, but in view of the lack of field complaints, I'll refrain from risking a back-patch. Discussion: https://postgr.es/m/E1izCmM-0005pV-Co@gemulon.postgresql.org
2020-02-24 23:28:33 +01:00
ReserveExternalFD();
}
/* Make sure we have the current logfile open */
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
if (openLogFile < 0)
{
XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
openLogTLI = tli;
openLogFile = XLogFileOpen(openLogSegNo, tli);
Account explicitly for long-lived FDs that are allocated outside fd.c. The comments in fd.c have long claimed that all file allocations should go through that module, but in reality that's not always practical. fd.c doesn't supply APIs for invoking some FD-producing syscalls like pipe() or epoll_create(); and the APIs it does supply for non-virtual FDs are mostly insistent on releasing those FDs at transaction end; and in some cases the actual open() call is in code that can't be made to use fd.c, such as libpq. This has led to a situation where, in a modern server, there are likely to be seven or so long-lived FDs per backend process that are not known to fd.c. Since NUM_RESERVED_FDS is only 10, that meant we had *very* few spare FDs if max_files_per_process is >= the system ulimit and fd.c had opened all the files it thought it safely could. The contrib/postgres_fdw regression test, in particular, could easily be made to fall over by running it under a restrictive ulimit. To improve matters, invent functions Acquire/Reserve/ReleaseExternalFD that allow outside callers to tell fd.c that they have or want to allocate a FD that's not directly managed by fd.c. Add calls to track all the fixed FDs in a standard backend session, so that we are honestly guaranteeing that NUM_RESERVED_FDS FDs remain unused below the EMFILE limit in a backend's idle state. The coding rules for these functions say that there's no need to call them in code that just allocates one FD over a fairly short interval; we can dip into NUM_RESERVED_FDS for such cases. That means that there aren't all that many places where we need to worry. But postgres_fdw and dblink must use this facility to account for long-lived FDs consumed by libpq connections. There may be other places where it's worth doing such accounting, too, but this seems like enough to solve the immediate problem. Internally to fd.c, "external" FDs are limited to max_safe_fds/3 FDs. (Callers can choose to ignore this limit, but of course it's unwise to do so except for fixed file allocations.) I also reduced the limit on "allocated" files to max_safe_fds/3 FDs (it had been max_safe_fds/2). Conceivably a smarter rule could be used here --- but in practice, on reasonable systems, max_safe_fds should be large enough that this isn't much of an issue, so KISS for now. To avoid possible regression in the number of external or allocated files that can be opened, increase FD_MINFREE and the lower limit on max_files_per_process a little bit; we now insist that the effective "ulimit -n" be at least 64. This seems like pretty clearly a bug fix, but in view of the lack of field complaints, I'll refrain from risking a back-patch. Discussion: https://postgr.es/m/E1izCmM-0005pV-Co@gemulon.postgresql.org
2020-02-24 23:28:33 +01:00
ReserveExternalFD();
}
/* Add current page to the set of pending pages-to-dump */
if (npages == 0)
{
/* first of group */
startidx = curridx;
startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
wal_segment_size);
}
npages++;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Dump the set if this will be the last loop iteration, or if we are
* at the last page of the cache area (since the next page won't be
* contiguous in memory), or if we are at the end of the logfile
* segment.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
last_iteration = WriteRqst.Write <= LogwrtResult.Write;
finishing_seg = !ispartialpage &&
(startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
if (last_iteration ||
curridx == XLogCtl->XLogCacheBlck ||
finishing_seg)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
{
char *from;
Size nbytes;
Size nleft;
int written;
instr_time start;
/* OK to write the page(s) */
from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
nbytes = npages * (Size) XLOG_BLCKSZ;
nleft = nbytes;
do
{
errno = 0;
/* Measure I/O timing to write WAL data */
if (track_wal_io_timing)
INSTR_TIME_SET_CURRENT(start);
pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
written = pg_pwrite(openLogFile, from, nleft, startoffset);
pgstat_report_wait_end();
/*
* Increment the I/O timing and the number of times WAL data
* were written out to disk.
*/
if (track_wal_io_timing)
{
instr_time duration;
INSTR_TIME_SET_CURRENT(duration);
INSTR_TIME_SUBTRACT(duration, start);
WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration);
}
WalStats.m_wal_write++;
if (written <= 0)
{
char xlogfname[MAXFNAMELEN];
int save_errno;
if (errno == EINTR)
continue;
save_errno = errno;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogFileName(xlogfname, tli, openLogSegNo,
wal_segment_size);
errno = save_errno;
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not write to log file %s "
"at offset %u, length %zu: %m",
xlogfname, startoffset, nleft)));
}
nleft -= written;
from += written;
startoffset += written;
} while (nleft > 0);
npages = 0;
/*
* If we just wrote the whole last page of a logfile segment,
* fsync the segment immediately. This avoids having to go back
* and re-open prior segments when an fsync request comes along
* later. Doing it here ensures that one and only one backend will
* perform this fsync.
*
* This is also the right place to notify the Archiver that the
* segment is ready to copy to archival storage, and to update the
* timer for archive_timeout, and to signal for a checkpoint if
* too many logfile segments have been used since the last
* checkpoint.
*/
if (finishing_seg)
{
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
issue_xlog_fsync(openLogFile, openLogSegNo, tli);
/* signal that we need to wakeup walsenders later */
WalSndWakeupRequest();
LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
if (XLogArchivingActive())
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogArchiveNotifySeg(openLogSegNo, tli);
XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
/*
* Request a checkpoint if we've consumed too much xlog since
* the last one. For speed, we first check using the local
* copy of RedoRecPtr, which might be out of date; if it looks
* like a checkpoint is needed, forcibly update RedoRecPtr and
* recheck.
*/
if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
{
(void) GetRedoRecPtr();
if (XLogCheckpointNeeded(openLogSegNo))
RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
}
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
if (ispartialpage)
{
/* Only asked to write a partial page */
LogwrtResult.Write = WriteRqst.Write;
break;
}
curridx = NextBufIdx(curridx);
/* If flexible, break out of loop as soon as we wrote something */
if (flexible && npages == 0)
break;
}
Assert(npages == 0);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* If asked to flush, do so
*/
if (LogwrtResult.Flush < WriteRqst.Flush &&
LogwrtResult.Flush < LogwrtResult.Write)
{
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Could get here without iterating above loop, in which case we might
* have no open file or the wrong one. However, we do not need to
* fsync more than one file.
*/
if (sync_method != SYNC_METHOD_OPEN &&
sync_method != SYNC_METHOD_OPEN_DSYNC)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
{
if (openLogFile >= 0 &&
!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size))
XLogFileClose();
if (openLogFile < 0)
{
XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
openLogTLI = tli;
openLogFile = XLogFileOpen(openLogSegNo, tli);
Account explicitly for long-lived FDs that are allocated outside fd.c. The comments in fd.c have long claimed that all file allocations should go through that module, but in reality that's not always practical. fd.c doesn't supply APIs for invoking some FD-producing syscalls like pipe() or epoll_create(); and the APIs it does supply for non-virtual FDs are mostly insistent on releasing those FDs at transaction end; and in some cases the actual open() call is in code that can't be made to use fd.c, such as libpq. This has led to a situation where, in a modern server, there are likely to be seven or so long-lived FDs per backend process that are not known to fd.c. Since NUM_RESERVED_FDS is only 10, that meant we had *very* few spare FDs if max_files_per_process is >= the system ulimit and fd.c had opened all the files it thought it safely could. The contrib/postgres_fdw regression test, in particular, could easily be made to fall over by running it under a restrictive ulimit. To improve matters, invent functions Acquire/Reserve/ReleaseExternalFD that allow outside callers to tell fd.c that they have or want to allocate a FD that's not directly managed by fd.c. Add calls to track all the fixed FDs in a standard backend session, so that we are honestly guaranteeing that NUM_RESERVED_FDS FDs remain unused below the EMFILE limit in a backend's idle state. The coding rules for these functions say that there's no need to call them in code that just allocates one FD over a fairly short interval; we can dip into NUM_RESERVED_FDS for such cases. That means that there aren't all that many places where we need to worry. But postgres_fdw and dblink must use this facility to account for long-lived FDs consumed by libpq connections. There may be other places where it's worth doing such accounting, too, but this seems like enough to solve the immediate problem. Internally to fd.c, "external" FDs are limited to max_safe_fds/3 FDs. (Callers can choose to ignore this limit, but of course it's unwise to do so except for fixed file allocations.) I also reduced the limit on "allocated" files to max_safe_fds/3 FDs (it had been max_safe_fds/2). Conceivably a smarter rule could be used here --- but in practice, on reasonable systems, max_safe_fds should be large enough that this isn't much of an issue, so KISS for now. To avoid possible regression in the number of external or allocated files that can be opened, increase FD_MINFREE and the lower limit on max_files_per_process a little bit; we now insist that the effective "ulimit -n" be at least 64. This seems like pretty clearly a bug fix, but in view of the lack of field complaints, I'll refrain from risking a back-patch. Discussion: https://postgr.es/m/E1izCmM-0005pV-Co@gemulon.postgresql.org
2020-02-24 23:28:33 +01:00
ReserveExternalFD();
}
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
issue_xlog_fsync(openLogFile, openLogSegNo, tli);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
}
/* signal that we need to wakeup walsenders later */
WalSndWakeupRequest();
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
LogwrtResult.Flush = LogwrtResult.Write;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Update shared-memory status
*
* We make sure that the shared 'request' values do not fall behind the
* 'result' values. This is not absolutely essential, but it saves some
* code in a couple of places.
*/
{
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->LogwrtResult = LogwrtResult;
if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
SpinLockRelease(&XLogCtl->info_lck);
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
}
/*
* Record the LSN for an asynchronous transaction commit/abort
* and nudge the WALWriter if there is work for it to do.
* (This should not be called for synchronous commits.)
*/
void
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
{
XLogRecPtr WriteRqstPtr = asyncXactLSN;
bool sleeping;
SpinLockAcquire(&XLogCtl->info_lck);
LogwrtResult = XLogCtl->LogwrtResult;
sleeping = XLogCtl->WalWriterSleeping;
if (XLogCtl->asyncXactLSN < asyncXactLSN)
XLogCtl->asyncXactLSN = asyncXactLSN;
SpinLockRelease(&XLogCtl->info_lck);
/*
* If the WALWriter is sleeping, we should kick it to make it come out of
* low-power mode. Otherwise, determine whether there's a full page of
* WAL available to write.
*/
if (!sleeping)
{
/* back off to last completed page boundary */
WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
/* if we have already flushed that far, we're done */
if (WriteRqstPtr <= LogwrtResult.Flush)
return;
}
/*
* Nudge the WALWriter: it has a full page of WAL to write, or we want it
* to come out of low-power mode so that this async commit will reach disk
* within the expected amount of time.
*/
if (ProcGlobal->walwriterLatch)
SetLatch(ProcGlobal->walwriterLatch);
}
/*
* Record the LSN up to which we can remove WAL because it's not required by
* any replication slot.
*/
void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->replicationSlotMinLSN = lsn;
SpinLockRelease(&XLogCtl->info_lck);
}
/*
* Return the oldest LSN we must retain to satisfy the needs of some
* replication slot.
*/
static XLogRecPtr
XLogGetReplicationSlotMinimumLSN(void)
{
XLogRecPtr retval;
SpinLockAcquire(&XLogCtl->info_lck);
retval = XLogCtl->replicationSlotMinLSN;
SpinLockRelease(&XLogCtl->info_lck);
return retval;
}
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* Advance minRecoveryPoint in control file.
*
* If we crash during recovery, we must reach this point again before the
* database is consistent.
*
* If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
* is only updated if it's not already greater than or equal to 'lsn'.
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
/* Quick check using our local copy of the variable */
if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
return;
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
/*
* An invalid minRecoveryPoint means that we need to recover all the WAL,
* i.e., we're doing crash recovery. We never modify the control file's
* value in that case, so we can short-circuit future checks here too. The
* local values of minRecoveryPoint and minRecoveryPointTLI should not be
Ensure correct minimum consistent point on standbys Startup process has improved its calculation of incorrect minimum consistent point in 8d68ee6, which ensures that all WAL available gets replayed when doing crash recovery, and has introduced an incorrect calculation of the minimum recovery point for non-startup processes, which can cause incorrect page references on a standby when for example the background writer flushed a couple of pages on-disk but was not updating the control file to let a subsequent crash recovery replay to where it should have. The only case where this has been reported to be a problem is when a standby needs to calculate the latest removed xid when replaying a btree deletion record, so one would need connections on a standby that happen just after recovery has thought it reached a consistent point. Using a background worker which is started after the consistent point is reached would be the easiest way to get into problems if it connects to a database. Having clients which attempt to connect periodically could also be a problem, but the odds of seeing this problem are much lower. The fix used is pretty simple, as the idea is to give access to the minimum recovery point written in the control file to non-startup processes so as they use a reference, while the startup process still initializes its own references of the minimum consistent point so as the original problem with incorrect page references happening post-promotion with a crash do not show up. Reported-by: Alexander Kukushkin Diagnosed-by: Alexander Kukushkin Author: Michael Paquier Reviewed-by: Kyotaro Horiguchi, Alexander Kukushkin Discussion: https://postgr.es/m/153492341830.1368.3936905691758473953@wrigleys.postgresql.org Backpatch-through: 9.3
2018-08-31 20:03:40 +02:00
* updated until crash recovery finishes. We only do this for the startup
* process as it should not update its own reference of minRecoveryPoint
* until it has finished crash recovery to make sure that all WAL
* available is replayed in this case. This also saves from extra locks
* taken on the control file from the startup process.
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
*/
if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
{
updateMinRecoveryPoint = false;
return;
}
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
/* update local copy */
LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
Ensure correct minimum consistent point on standbys Startup process has improved its calculation of incorrect minimum consistent point in 8d68ee6, which ensures that all WAL available gets replayed when doing crash recovery, and has introduced an incorrect calculation of the minimum recovery point for non-startup processes, which can cause incorrect page references on a standby when for example the background writer flushed a couple of pages on-disk but was not updating the control file to let a subsequent crash recovery replay to where it should have. The only case where this has been reported to be a problem is when a standby needs to calculate the latest removed xid when replaying a btree deletion record, so one would need connections on a standby that happen just after recovery has thought it reached a consistent point. Using a background worker which is started after the consistent point is reached would be the easiest way to get into problems if it connects to a database. Having clients which attempt to connect periodically could also be a problem, but the odds of seeing this problem are much lower. The fix used is pretty simple, as the idea is to give access to the minimum recovery point written in the control file to non-startup processes so as they use a reference, while the startup process still initializes its own references of the minimum consistent point so as the original problem with incorrect page references happening post-promotion with a crash do not show up. Reported-by: Alexander Kukushkin Diagnosed-by: Alexander Kukushkin Author: Michael Paquier Reviewed-by: Kyotaro Horiguchi, Alexander Kukushkin Discussion: https://postgr.es/m/153492341830.1368.3936905691758473953@wrigleys.postgresql.org Backpatch-through: 9.3
2018-08-31 20:03:40 +02:00
updateMinRecoveryPoint = false;
else if (force || LocalMinRecoveryPoint < lsn)
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
{
XLogRecPtr newMinRecoveryPoint;
TimeLineID newMinRecoveryPointTLI;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* To avoid having to update the control file too often, we update it
* all the way to the last record being replayed, even though 'lsn'
* would suffice for correctness. This also allows the 'force' case
* to not need a valid 'lsn' value.
*
* Another important reason for doing it this way is that the passed
* 'lsn' value could be bogus, i.e., past the end of available WAL, if
* the caller got it from a corrupted heap page. Accepting such a
* value as the min recovery point would prevent us from coming up at
* all. Instead, we just log a warning and continue with recovery.
* (See also the comments about corrupt LSNs in XLogFlush.)
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
if (!force && newMinRecoveryPoint < lsn)
elog(WARNING,
"xlog min recovery request %X/%X is past current point %X/%X",
LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/* update control file */
if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
{
ControlFile->minRecoveryPoint = newMinRecoveryPoint;
ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
UpdateControlFile();
LocalMinRecoveryPoint = newMinRecoveryPoint;
LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
ereport(DEBUG2,
(errmsg_internal("updated min recovery point to %X/%X on timeline %u",
LSN_FORMAT_ARGS(newMinRecoveryPoint),
newMinRecoveryPointTLI)));
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
}
}
LWLockRelease(ControlFileLock);
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Ensure that all XLOG data through the given position is flushed to disk.
*
* NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
* already held, and we try to avoid acquiring it if possible.
*/
void
XLogFlush(XLogRecPtr record)
{
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
TimeLineID insertTLI = XLogCtl->InsertTimeLineID;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* During REDO, we are reading not writing WAL. Therefore, instead of
* trying to flush the WAL, we should update minRecoveryPoint instead. We
* test XLogInsertAllowed(), not InRecovery, because we need checkpointer
* to act this way too, and because when it tries to write the
* end-of-recovery checkpoint, it should indeed flush.
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
if (!XLogInsertAllowed())
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
{
UpdateMinRecoveryPoint(record, false);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
return;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/* Quick exit if already known flushed */
if (record <= LogwrtResult.Flush)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
return;
#ifdef WAL_DEBUG
if (XLOG_DEBUG)
elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
LSN_FORMAT_ARGS(record),
LSN_FORMAT_ARGS(LogwrtResult.Write),
LSN_FORMAT_ARGS(LogwrtResult.Flush));
#endif
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
START_CRIT_SECTION();
/*
* Since fsync is usually a horribly expensive operation, we try to
* piggyback as much data as we can on each fsync: if we see any more data
* entered into the xlog buffer, we'll write and fsync that too, so that
* the final value of LogwrtResult.Flush is as large as possible. This
* gives us some chance of avoiding another fsync immediately after.
*/
/* initialize to given target; may increase below */
WriteRqstPtr = record;
/*
* Now wait until we get the write lock, or someone else does the flush
* for us.
*/
for (;;)
{
XLogRecPtr insertpos;
/* read LogwrtResult and update local state */
SpinLockAcquire(&XLogCtl->info_lck);
if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
LogwrtResult = XLogCtl->LogwrtResult;
SpinLockRelease(&XLogCtl->info_lck);
/* done already? */
if (record <= LogwrtResult.Flush)
break;
/*
* Before actually performing the write, wait for all in-flight
* insertions to the pages we're about to write to finish.
*/
insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
/*
* Try to get the write lock. If we can't get it immediately, wait
* until it's released, and recheck if we still need to do the flush
* or if the backend that held the lock did it for us already. This
* helps to maintain a good rate of group committing when the system
* is bottlenecked by the speed of fsyncing.
*/
if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
{
/*
* The lock is now free, but we didn't acquire it yet. Before we
* do, loop back to check if someone else flushed the record for
* us already.
*/
continue;
}
/* Got the lock; recheck whether request is satisfied */
LogwrtResult = XLogCtl->LogwrtResult;
if (record <= LogwrtResult.Flush)
{
LWLockRelease(WALWriteLock);
break;
}
/*
* Sleep before flush! By adding a delay here, we may give further
* backends the opportunity to join the backlog of group commit
* followers; this can significantly improve transaction throughput,
* at the risk of increasing transaction latency.
*
* We do not sleep if enableFsync is not turned on, nor if there are
* fewer than CommitSiblings other backends with active transactions.
*/
if (CommitDelay > 0 && enableFsync &&
MinimumActiveBackends(CommitSiblings))
{
pg_usleep(CommitDelay);
/*
* Re-check how far we can now flush the WAL. It's generally not
2015-05-20 00:37:46 +02:00
* safe to call WaitXLogInsertionsToFinish while holding
* WALWriteLock, because an in-progress insertion might need to
* also grab WALWriteLock to make progress. But we know that all
* the insertions up to insertpos have already finished, because
* that's what the earlier WaitXLogInsertionsToFinish() returned.
* We're only calling it again to allow insertpos to be moved
* further forward, not to actually wait for anyone.
*/
insertpos = WaitXLogInsertionsToFinish(insertpos);
}
/* try to write/flush later additions to XLOG as well */
WriteRqst.Write = insertpos;
WriteRqst.Flush = insertpos;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogWrite(WriteRqst, insertTLI, false);
LWLockRelease(WALWriteLock);
/* done */
break;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
}
END_CRIT_SECTION();
/* wake up walsenders now that we've released heavily contended locks */
WalSndWakeupProcessRequests();
/*
* If we still haven't flushed to the request point then we have a
* problem; most likely, the requested flush point is past end of XLOG.
* This has been seen to occur when a disk page has a corrupted LSN.
*
* Formerly we treated this as a PANIC condition, but that hurts the
* system's robustness rather than helping it: we do not want to take down
* the whole system due to corruption on one data page. In particular, if
* the bad page is encountered again during recovery then we would be
* unable to restart the database at all! (This scenario actually
* happened in the field several times with 7.1 releases.) As of 8.4, bad
* LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
* the only time we can reach here during recovery is while flushing the
* end-of-recovery checkpoint record, and we don't expect that to have a
* bad LSN.
*
* Note that for calls from xact.c, the ERROR will be promoted to PANIC
* since xact.c calls this routine inside a critical section. However,
* calls from bufmgr.c are not within critical sections and so we will not
* force a restart for a bad LSN on a data page.
*/
if (LogwrtResult.Flush < record)
elog(ERROR,
"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
LSN_FORMAT_ARGS(record),
LSN_FORMAT_ARGS(LogwrtResult.Flush));
}
/*
* Write & flush xlog, but without specifying exactly where to.
*
* We normally write only completed blocks; but if there is nothing to do on
* that basis, we check for unwritten async commits in the current incomplete
* block, and write through the latest one of those. Thus, if async commits
* are not being used, we will write complete blocks only.
*
* If, based on the above, there's anything to write we do so immediately. But
* to avoid calling fsync, fdatasync et. al. at a rate that'd impact
* concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
* more than wal_writer_flush_after unflushed blocks.
*
* We can guarantee that async commits reach disk after at most three
* wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
* to write "flexibly", meaning it can stop at the end of the buffer ring;
* this makes a difference only with very high load or long wal_writer_delay,
* but imposes one extra cycle for the worst case for async commits.)
*
* This routine is invoked periodically by the background walwriter process.
*
* Returns true if there was any work to do, even if we skipped flushing due
* to wal_writer_delay/wal_writer_flush_after.
*/
bool
XLogBackgroundFlush(void)
{
XLogwrtRqst WriteRqst;
bool flexible = true;
static TimestampTz lastflush;
TimestampTz now;
int flushbytes;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
TimeLineID insertTLI;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/* XLOG doesn't need flushing during recovery */
if (RecoveryInProgress())
return false;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
/*
* Since we're not in recovery, InsertTimeLineID is set and can't change,
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
* so we can read it without a lock.
*/
insertTLI = XLogCtl->InsertTimeLineID;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
/* read LogwrtResult and update local state */
SpinLockAcquire(&XLogCtl->info_lck);
LogwrtResult = XLogCtl->LogwrtResult;
WriteRqst = XLogCtl->LogwrtRqst;
SpinLockRelease(&XLogCtl->info_lck);
/* back off to last completed page boundary */
WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
/* if we have already flushed that far, consider async commit records */
if (WriteRqst.Write <= LogwrtResult.Flush)
{
SpinLockAcquire(&XLogCtl->info_lck);
WriteRqst.Write = XLogCtl->asyncXactLSN;
SpinLockRelease(&XLogCtl->info_lck);
flexible = false; /* ensure it all gets written */
}
/*
* If already known flushed, we're done. Just need to check if we are
* holding an open file handle to a logfile that's no longer in use,
* preventing the file from being deleted.
*/
if (WriteRqst.Write <= LogwrtResult.Flush)
{
if (openLogFile >= 0)
{
if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
wal_segment_size))
{
XLogFileClose();
}
}
return false;
}
/*
* Determine how far to flush WAL, based on the wal_writer_delay and
* wal_writer_flush_after GUCs.
*/
now = GetCurrentTimestamp();
flushbytes =
WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
if (WalWriterFlushAfter == 0 || lastflush == 0)
{
/* first call, or block based limits disabled */
WriteRqst.Flush = WriteRqst.Write;
lastflush = now;
}
else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
{
/*
* Flush the writes at least every WalWriterDelay ms. This is
* important to bound the amount of time it takes for an asynchronous
* commit to hit disk.
*/
WriteRqst.Flush = WriteRqst.Write;
lastflush = now;
}
else if (flushbytes >= WalWriterFlushAfter)
{
/* exceeded wal_writer_flush_after blocks, flush */
WriteRqst.Flush = WriteRqst.Write;
lastflush = now;
}
else
{
/* no flushing, this time round */
WriteRqst.Flush = 0;
}
#ifdef WAL_DEBUG
if (XLOG_DEBUG)
elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
LSN_FORMAT_ARGS(WriteRqst.Write),
LSN_FORMAT_ARGS(WriteRqst.Flush),
LSN_FORMAT_ARGS(LogwrtResult.Write),
LSN_FORMAT_ARGS(LogwrtResult.Flush));
#endif
START_CRIT_SECTION();
/* now wait for any in-progress insertions to finish and get write lock */
WaitXLogInsertionsToFinish(WriteRqst.Write);
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
LogwrtResult = XLogCtl->LogwrtResult;
if (WriteRqst.Write > LogwrtResult.Write ||
WriteRqst.Flush > LogwrtResult.Flush)
{
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogWrite(WriteRqst, insertTLI, flexible);
}
LWLockRelease(WALWriteLock);
END_CRIT_SECTION();
/* wake up walsenders now that we've released heavily contended locks */
WalSndWakeupProcessRequests();
/*
* Great, done. To take some work off the critical path, try to initialize
* as many of the no-longer-needed WAL buffers for future use as we can.
*/
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
/*
* If we determined that we need to write data, but somebody else
* wrote/flushed already, it should be considered as being active, to
* avoid hibernating too early.
*/
return true;
}
/*
* Test whether XLOG data has been flushed up to (at least) the given position.
*
* Returns true if a flush is still needed. (It may be that someone else
* is already in process of flushing that far, however.)
*/
bool
XLogNeedsFlush(XLogRecPtr record)
{
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* During recovery, we don't flush WAL but update minRecoveryPoint
* instead. So "needs flush" is taken to mean whether minRecoveryPoint
* would need to be updated.
*/
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
if (RecoveryInProgress())
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
{
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
/*
* An invalid minRecoveryPoint means that we need to recover all the
* WAL, i.e., we're doing crash recovery. We never modify the control
* file's value in that case, so we can short-circuit future checks
Ensure correct minimum consistent point on standbys Startup process has improved its calculation of incorrect minimum consistent point in 8d68ee6, which ensures that all WAL available gets replayed when doing crash recovery, and has introduced an incorrect calculation of the minimum recovery point for non-startup processes, which can cause incorrect page references on a standby when for example the background writer flushed a couple of pages on-disk but was not updating the control file to let a subsequent crash recovery replay to where it should have. The only case where this has been reported to be a problem is when a standby needs to calculate the latest removed xid when replaying a btree deletion record, so one would need connections on a standby that happen just after recovery has thought it reached a consistent point. Using a background worker which is started after the consistent point is reached would be the easiest way to get into problems if it connects to a database. Having clients which attempt to connect periodically could also be a problem, but the odds of seeing this problem are much lower. The fix used is pretty simple, as the idea is to give access to the minimum recovery point written in the control file to non-startup processes so as they use a reference, while the startup process still initializes its own references of the minimum consistent point so as the original problem with incorrect page references happening post-promotion with a crash do not show up. Reported-by: Alexander Kukushkin Diagnosed-by: Alexander Kukushkin Author: Michael Paquier Reviewed-by: Kyotaro Horiguchi, Alexander Kukushkin Discussion: https://postgr.es/m/153492341830.1368.3936905691758473953@wrigleys.postgresql.org Backpatch-through: 9.3
2018-08-31 20:03:40 +02:00
* here too. This triggers a quick exit path for the startup process,
* which cannot update its local copy of minRecoveryPoint as long as
* it has not replayed all WAL available when doing crash recovery.
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
*/
if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
updateMinRecoveryPoint = false;
/* Quick exit if already known to be updated or cannot be updated */
if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
return false;
/*
* Update local copy of minRecoveryPoint. But if the lock is busy,
* just return a conservative guess.
*/
if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
return true;
LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
LWLockRelease(ControlFileLock);
Ensure correct minimum consistent point on standbys Startup process has improved its calculation of incorrect minimum consistent point in 8d68ee6, which ensures that all WAL available gets replayed when doing crash recovery, and has introduced an incorrect calculation of the minimum recovery point for non-startup processes, which can cause incorrect page references on a standby when for example the background writer flushed a couple of pages on-disk but was not updating the control file to let a subsequent crash recovery replay to where it should have. The only case where this has been reported to be a problem is when a standby needs to calculate the latest removed xid when replaying a btree deletion record, so one would need connections on a standby that happen just after recovery has thought it reached a consistent point. Using a background worker which is started after the consistent point is reached would be the easiest way to get into problems if it connects to a database. Having clients which attempt to connect periodically could also be a problem, but the odds of seeing this problem are much lower. The fix used is pretty simple, as the idea is to give access to the minimum recovery point written in the control file to non-startup processes so as they use a reference, while the startup process still initializes its own references of the minimum consistent point so as the original problem with incorrect page references happening post-promotion with a crash do not show up. Reported-by: Alexander Kukushkin Diagnosed-by: Alexander Kukushkin Author: Michael Paquier Reviewed-by: Kyotaro Horiguchi, Alexander Kukushkin Discussion: https://postgr.es/m/153492341830.1368.3936905691758473953@wrigleys.postgresql.org Backpatch-through: 9.3
2018-08-31 20:03:40 +02:00
/*
* Check minRecoveryPoint for any other process than the startup
* process doing crash recovery, which should not update the control
* file value if crash recovery is still running.
*/
if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
Ensure correct minimum consistent point on standbys Startup process has improved its calculation of incorrect minimum consistent point in 8d68ee6, which ensures that all WAL available gets replayed when doing crash recovery, and has introduced an incorrect calculation of the minimum recovery point for non-startup processes, which can cause incorrect page references on a standby when for example the background writer flushed a couple of pages on-disk but was not updating the control file to let a subsequent crash recovery replay to where it should have. The only case where this has been reported to be a problem is when a standby needs to calculate the latest removed xid when replaying a btree deletion record, so one would need connections on a standby that happen just after recovery has thought it reached a consistent point. Using a background worker which is started after the consistent point is reached would be the easiest way to get into problems if it connects to a database. Having clients which attempt to connect periodically could also be a problem, but the odds of seeing this problem are much lower. The fix used is pretty simple, as the idea is to give access to the minimum recovery point written in the control file to non-startup processes so as they use a reference, while the startup process still initializes its own references of the minimum consistent point so as the original problem with incorrect page references happening post-promotion with a crash do not show up. Reported-by: Alexander Kukushkin Diagnosed-by: Alexander Kukushkin Author: Michael Paquier Reviewed-by: Kyotaro Horiguchi, Alexander Kukushkin Discussion: https://postgr.es/m/153492341830.1368.3936905691758473953@wrigleys.postgresql.org Backpatch-through: 9.3
2018-08-31 20:03:40 +02:00
updateMinRecoveryPoint = false;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/* check again */
if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
Ensure correct minimum consistent point on standbys Startup process has improved its calculation of incorrect minimum consistent point in 8d68ee6, which ensures that all WAL available gets replayed when doing crash recovery, and has introduced an incorrect calculation of the minimum recovery point for non-startup processes, which can cause incorrect page references on a standby when for example the background writer flushed a couple of pages on-disk but was not updating the control file to let a subsequent crash recovery replay to where it should have. The only case where this has been reported to be a problem is when a standby needs to calculate the latest removed xid when replaying a btree deletion record, so one would need connections on a standby that happen just after recovery has thought it reached a consistent point. Using a background worker which is started after the consistent point is reached would be the easiest way to get into problems if it connects to a database. Having clients which attempt to connect periodically could also be a problem, but the odds of seeing this problem are much lower. The fix used is pretty simple, as the idea is to give access to the minimum recovery point written in the control file to non-startup processes so as they use a reference, while the startup process still initializes its own references of the minimum consistent point so as the original problem with incorrect page references happening post-promotion with a crash do not show up. Reported-by: Alexander Kukushkin Diagnosed-by: Alexander Kukushkin Author: Michael Paquier Reviewed-by: Kyotaro Horiguchi, Alexander Kukushkin Discussion: https://postgr.es/m/153492341830.1368.3936905691758473953@wrigleys.postgresql.org Backpatch-through: 9.3
2018-08-31 20:03:40 +02:00
return false;
else
return true;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
}
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/* Quick exit if already known flushed */
if (record <= LogwrtResult.Flush)
return false;
/* read LogwrtResult and update local state */
SpinLockAcquire(&XLogCtl->info_lck);
LogwrtResult = XLogCtl->LogwrtResult;
SpinLockRelease(&XLogCtl->info_lck);
/* check again */
if (record <= LogwrtResult.Flush)
return false;
return true;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Try to make a given XLOG file segment exist.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*
* logsegno: identify segment.
*
* *added: on return, true if this call raised the number of extant segments.
*
* path: on return, this char[MAXPGPATH] has the path to the logsegno file.
*
* Returns -1 or FD of opened file. A -1 here is not an error; a caller
* wanting an open segment should attempt to open "path", which usually will
* succeed. (This is weird, but it's efficient for the callers.)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
static int
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
bool *added, char *path)
{
char tmppath[MAXPGPATH];
PGAlignedXLogBlock zbuffer;
XLogSegNo installed_segno;
XLogSegNo max_segno;
int fd;
int save_errno;
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
Assert(logtli != 0);
XLogFilePath(path, logtli, logsegno, wal_segment_size);
/*
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
* Try to use existent file (checkpoint maker may have created it already)
*/
*added = false;
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
if (fd < 0)
{
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
}
else
return fd;
/*
* Initialize an empty (all zeroes) segment. NOTE: it is possible that
* another process is doing the same thing. If so, we will end up
* pre-creating an extra log segment. That seems OK, and better than
* holding the lock throughout this lengthy process.
*/
elog(DEBUG2, "creating and filling new WAL file");
snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
unlink(tmppath);
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m", tmppath)));
memset(zbuffer.data, 0, XLOG_BLCKSZ);
pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
save_errno = 0;
if (wal_init_zero)
{
struct iovec iov[PG_IOV_MAX];
int blocks;
/*
* Zero-fill the file. With this setting, we do this the hard way to
* ensure that all the file space has really been allocated. On
* platforms that allow "holes" in files, just seeking to the end
* doesn't allocate intermediate space. This way, we know that we
* have all the space and (after the fsync below) that all the
* indirect blocks are down on disk. Therefore, fdatasync(2) or
* O_DSYNC will be sufficient to sync future writes to the log file.
*/
/* Prepare to write out a lot of copies of our zero buffer at once. */
for (int i = 0; i < lengthof(iov); ++i)
{
iov[i].iov_base = zbuffer.data;
iov[i].iov_len = XLOG_BLCKSZ;
}
/* Loop, writing as many blocks as we can for each system call. */
blocks = wal_segment_size / XLOG_BLCKSZ;
for (int i = 0; i < blocks;)
{
int iovcnt = Min(blocks - i, lengthof(iov));
off_t offset = i * XLOG_BLCKSZ;
if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
{
save_errno = errno;
break;
}
i += iovcnt;
}
}
else
{
/*
* Otherwise, seeking to the end and writing a solitary byte is
* enough.
*/
errno = 0;
if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
{
/* if write didn't set errno, assume no disk space */
save_errno = errno ? errno : ENOSPC;
}
}
pgstat_report_wait_end();
if (save_errno)
{
/*
* If we fail to make the file, delete it to release disk space
*/
unlink(tmppath);
close(fd);
errno = save_errno;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tmppath)));
}
pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
if (pg_fsync(fd) != 0)
{
int save_errno = errno;
close(fd);
errno = save_errno;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tmppath)));
}
pgstat_report_wait_end();
if (close(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", tmppath)));
2000-11-27 06:36:12 +01:00
/*
* Now move the segment into place with its final name. Cope with
* possibility that someone else has created the file while we were
* filling ours: if so, use ours to pre-create a future log segment.
*/
installed_segno = logsegno;
/*
* XXX: What should we use as max_segno? We used to use XLOGfileslop when
* that was a constant, but that was always a bit dubious: normally, at a
* checkpoint, XLOGfileslop was the offset from the checkpoint record, but
* here, it was the offset from the insert location. We can't do the
* normal XLOGfileslop calculation here because we don't have access to
* the prior checkpoint's redo location. So somewhat arbitrarily, just use
* CheckPointSegments.
*/
max_segno = logsegno + CheckPointSegments;
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
logtli))
{
*added = true;
elog(DEBUG2, "done creating and filling new WAL file");
}
else
{
Don't error out if recycling or removing an old WAL segment fails at the end of checkpoint. Although the checkpoint has been written to WAL at that point already, so that all data is safe, and we'll retry removing the WAL segment at the next checkpoint, if such a failure persists we won't be able to remove any other old WAL segments either and will eventually run out of disk space. It's better to treat the failure as non-fatal, and move on to clean any other WAL segment and continue with any other end-of-checkpoint cleanup. We don't normally expect any such failures, but on Windows it can happen with some anti-virus or backup software that lock files without FILE_SHARE_DELETE flag. Also, the loop in pgrename() to retry when the file is locked was broken. If a file is locked on Windows, you get ERROR_SHARE_VIOLATION, not ERROR_ACCESS_DENIED, at least on modern versions. Fix that, although I left the check for ERROR_ACCESS_DENIED in there as well (presumably it was correct in some environment), and added ERROR_LOCK_VIOLATION to be consistent with similar checks in pgwin32_open(). Reduce the timeout on the loop from 30s to 10s, on the grounds that since it's been broken, we've effectively had a timeout of 0s and no-one has complained, so a smaller timeout is actually closer to the old behavior. A longer timeout would mean that if recycling a WAL file fails because it's locked for some reason, InstallXLogFileSegment() will hold ControlFileLock for longer, potentially blocking other backends, so a long timeout isn't totally harmless. While we're at it, set errno correctly in pgrename(). Backpatch to 8.2, which is the oldest version supported on Windows. The xlog.c changes would make sense on other platforms and thus on older versions as well, but since there's no such locking issues on other platforms, it's not worth it.
2009-09-13 20:32:08 +02:00
/*
* No need for any more future segments, or InstallXLogFileSegment()
* failed to rename the file into place. If the rename failed, a
* caller opening the file may fail.
Don't error out if recycling or removing an old WAL segment fails at the end of checkpoint. Although the checkpoint has been written to WAL at that point already, so that all data is safe, and we'll retry removing the WAL segment at the next checkpoint, if such a failure persists we won't be able to remove any other old WAL segments either and will eventually run out of disk space. It's better to treat the failure as non-fatal, and move on to clean any other WAL segment and continue with any other end-of-checkpoint cleanup. We don't normally expect any such failures, but on Windows it can happen with some anti-virus or backup software that lock files without FILE_SHARE_DELETE flag. Also, the loop in pgrename() to retry when the file is locked was broken. If a file is locked on Windows, you get ERROR_SHARE_VIOLATION, not ERROR_ACCESS_DENIED, at least on modern versions. Fix that, although I left the check for ERROR_ACCESS_DENIED in there as well (presumably it was correct in some environment), and added ERROR_LOCK_VIOLATION to be consistent with similar checks in pgwin32_open(). Reduce the timeout on the loop from 30s to 10s, on the grounds that since it's been broken, we've effectively had a timeout of 0s and no-one has complained, so a smaller timeout is actually closer to the old behavior. A longer timeout would mean that if recycling a WAL file fails because it's locked for some reason, InstallXLogFileSegment() will hold ControlFileLock for longer, potentially blocking other backends, so a long timeout isn't totally harmless. While we're at it, set errno correctly in pgrename(). Backpatch to 8.2, which is the oldest version supported on Windows. The xlog.c changes would make sense on other platforms and thus on older versions as well, but since there's no such locking issues on other platforms, it's not worth it.
2009-09-13 20:32:08 +02:00
*/
unlink(tmppath);
elog(DEBUG2, "abandoned new WAL file");
}
return -1;
}
/*
* Create a new XLOG file segment, or open a pre-existing one.
*
* logsegno: identify segment to be created/opened.
*
* Returns FD of opened file.
*
* Note: errors here are ERROR not PANIC because we might or might not be
* inside a critical section (eg, during checkpoint there is no reason to
* take down the system on failure). They will promote to PANIC if we are
* in a critical section.
*/
int
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
{
bool ignore_added;
char path[MAXPGPATH];
int fd;
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
Assert(logtli != 0);
fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
if (fd >= 0)
return fd;
/* Now open original target segment (might not be file I just made) */
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
return fd;
}
/*
* Create a new XLOG file segment by copying a pre-existing one.
*
* destsegno: identify segment to be created.
*
2016-03-15 21:57:17 +01:00
* srcTLI, srcsegno: identify segment to be copied (could be from
* a different timeline)
*
* upto: how much of the source file to copy (the rest is filled with
* zeros)
*
* Currently this is only used during recovery, and so there are no locking
* considerations. But we should be just as tense as XLogFileInit to avoid
* emplacing a bogus file.
*/
static void
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
TimeLineID srcTLI, XLogSegNo srcsegno,
int upto)
{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
PGAlignedXLogBlock buffer;
int srcfd;
int fd;
int nbytes;
/*
* Open the source file
*/
XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
if (srcfd < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
/*
* Copy into a temp file name.
*/
snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
unlink(tmppath);
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m", tmppath)));
/*
* Do the data copying.
*/
for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
{
int nread;
nread = upto - nbytes;
/*
* The part that is not read from the source file is filled with
* zeros.
*/
if (nread < sizeof(buffer))
memset(buffer.data, 0, sizeof(buffer));
if (nread > 0)
{
int r;
if (nread > sizeof(buffer))
nread = sizeof(buffer);
pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
r = read(srcfd, buffer.data, nread);
if (r != nread)
{
if (r < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
path)));
else
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("could not read file \"%s\": read %d of %zu",
path, r, (Size) nread)));
}
pgstat_report_wait_end();
}
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
{
int save_errno = errno;
/*
* If we fail to make the file, delete it to release disk space
*/
unlink(tmppath);
/* if write didn't set errno, assume problem is no disk space */
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tmppath)));
}
pgstat_report_wait_end();
}
pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
if (pg_fsync(fd) != 0)
PANIC on fsync() failure. On some operating systems, it doesn't make sense to retry fsync(), because dirty data cached by the kernel may have been dropped on write-back failure. In that case the only remaining copy of the data is in the WAL. A subsequent fsync() could appear to succeed, but not have flushed the data. That means that a future checkpoint could apparently complete successfully but have lost data. Therefore, violently prevent any future checkpoint attempts by panicking on the first fsync() failure. Note that we already did the same for WAL data; this change extends that behavior to non-temporary data files. Provide a GUC data_sync_retry to control this new behavior, for users of operating systems that don't eject dirty data, and possibly forensic/testing uses. If it is set to on and the write-back error was transient, a later checkpoint might genuinely succeed (on a system that does not throw away buffers on failure); if the error is permanent, later checkpoints will continue to fail. The GUC defaults to off, meaning that we panic. Back-patch to all supported releases. There is still a narrow window for error-loss on some operating systems: if the file is closed and later reopened and a write-back error occurs in the intervening time, but the inode has the bad luck to be evicted due to memory pressure before we reopen, we could miss the error. A later patch will address that with a scheme for keeping files with dirty data open at all times, but we judge that to be too complicated to back-patch. Author: Craig Ringer, with some adjustments by Thomas Munro Reported-by: Craig Ringer Reviewed-by: Robert Haas, Thomas Munro, Andres Freund Discussion: https://postgr.es/m/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de
2018-11-19 01:31:10 +01:00
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", tmppath)));
pgstat_report_wait_end();
if (CloseTransientFile(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", tmppath)));
if (CloseTransientFile(srcfd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", path)));
/*
* Now move the segment into place with its final name.
*/
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
elog(ERROR, "InstallXLogFileSegment should not have failed");
}
/*
* Install a new XLOG segment file as a current or future log segment.
*
* This is used both to install a newly-created segment (which has a temp
* filename while it's being created) and to recycle an old segment.
*
* *segno: identify segment to install as (or first possible target).
* When find_free is true, this is modified on return to indicate the
* actual installation location or last segment searched.
*
* tmppath: initial name of file to install. It will be renamed into place.
*
* find_free: if true, install the new segment at the first empty segno
* number at or after the passed numbers. If false, install the new segment
* exactly where specified, deleting any existing segment file there.
*
* max_segno: maximum segment number to install the new file as. Fail if no
* free slot is found between *segno and max_segno. (Ignored when find_free
* is false.)
*
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
* tli: The timeline on which the new segment should be installed.
*
* Returns true if the file was installed successfully. false indicates that
* max_segno limit was exceeded, the startup process has disabled this
* function for now, or an error occurred while renaming the file into place.
*/
static bool
InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
bool find_free, XLogSegNo max_segno, TimeLineID tli)
{
char path[MAXPGPATH];
struct stat stat_buf;
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
Assert(tli != 0);
XLogFilePath(path, tli, *segno, wal_segment_size);
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (!XLogCtl->InstallXLogFileSegmentActive)
{
LWLockRelease(ControlFileLock);
return false;
}
if (!find_free)
{
/* Force installation: get rid of any pre-existing segment file */
durable_unlink(path, DEBUG1);
}
else
{
/* Find a free slot to put it in */
while (stat(path, &stat_buf) == 0)
{
if ((*segno) >= max_segno)
{
/* Failed to find a free slot within specified range */
LWLockRelease(ControlFileLock);
return false;
}
(*segno)++;
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
XLogFilePath(path, tli, *segno, wal_segment_size);
}
}
/*
* Perform the rename using link if available, paranoidly trying to avoid
* overwriting an existing file (there shouldn't be one).
*/
if (durable_rename_excl(tmppath, path, LOG) != 0)
Don't error out if recycling or removing an old WAL segment fails at the end of checkpoint. Although the checkpoint has been written to WAL at that point already, so that all data is safe, and we'll retry removing the WAL segment at the next checkpoint, if such a failure persists we won't be able to remove any other old WAL segments either and will eventually run out of disk space. It's better to treat the failure as non-fatal, and move on to clean any other WAL segment and continue with any other end-of-checkpoint cleanup. We don't normally expect any such failures, but on Windows it can happen with some anti-virus or backup software that lock files without FILE_SHARE_DELETE flag. Also, the loop in pgrename() to retry when the file is locked was broken. If a file is locked on Windows, you get ERROR_SHARE_VIOLATION, not ERROR_ACCESS_DENIED, at least on modern versions. Fix that, although I left the check for ERROR_ACCESS_DENIED in there as well (presumably it was correct in some environment), and added ERROR_LOCK_VIOLATION to be consistent with similar checks in pgwin32_open(). Reduce the timeout on the loop from 30s to 10s, on the grounds that since it's been broken, we've effectively had a timeout of 0s and no-one has complained, so a smaller timeout is actually closer to the old behavior. A longer timeout would mean that if recycling a WAL file fails because it's locked for some reason, InstallXLogFileSegment() will hold ControlFileLock for longer, potentially blocking other backends, so a long timeout isn't totally harmless. While we're at it, set errno correctly in pgrename(). Backpatch to 8.2, which is the oldest version supported on Windows. The xlog.c changes would make sense on other platforms and thus on older versions as well, but since there's no such locking issues on other platforms, it's not worth it.
2009-09-13 20:32:08 +02:00
{
LWLockRelease(ControlFileLock);
/* durable_rename_excl already emitted log message */
Don't error out if recycling or removing an old WAL segment fails at the end of checkpoint. Although the checkpoint has been written to WAL at that point already, so that all data is safe, and we'll retry removing the WAL segment at the next checkpoint, if such a failure persists we won't be able to remove any other old WAL segments either and will eventually run out of disk space. It's better to treat the failure as non-fatal, and move on to clean any other WAL segment and continue with any other end-of-checkpoint cleanup. We don't normally expect any such failures, but on Windows it can happen with some anti-virus or backup software that lock files without FILE_SHARE_DELETE flag. Also, the loop in pgrename() to retry when the file is locked was broken. If a file is locked on Windows, you get ERROR_SHARE_VIOLATION, not ERROR_ACCESS_DENIED, at least on modern versions. Fix that, although I left the check for ERROR_ACCESS_DENIED in there as well (presumably it was correct in some environment), and added ERROR_LOCK_VIOLATION to be consistent with similar checks in pgwin32_open(). Reduce the timeout on the loop from 30s to 10s, on the grounds that since it's been broken, we've effectively had a timeout of 0s and no-one has complained, so a smaller timeout is actually closer to the old behavior. A longer timeout would mean that if recycling a WAL file fails because it's locked for some reason, InstallXLogFileSegment() will hold ControlFileLock for longer, potentially blocking other backends, so a long timeout isn't totally harmless. While we're at it, set errno correctly in pgrename(). Backpatch to 8.2, which is the oldest version supported on Windows. The xlog.c changes would make sense on other platforms and thus on older versions as well, but since there's no such locking issues on other platforms, it's not worth it.
2009-09-13 20:32:08 +02:00
return false;
}
LWLockRelease(ControlFileLock);
return true;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Open a pre-existing logfile segment for writing.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
int
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogFileOpen(XLogSegNo segno, TimeLineID tli)
{
char path[MAXPGPATH];
int fd;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogFilePath(path, tli, segno, wal_segment_size);
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
if (fd < 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
return fd;
}
/*
* Close the current logfile segment for writing.
*/
static void
XLogFileClose(void)
{
Assert(openLogFile >= 0);
/*
* WAL segment files will not be re-read in normal operation, so we advise
* the OS to release any cached pages. But do not do so if WAL archiving
* or streaming is active, because archiver and walsender process could
* use the cache to read the WAL segment.
*/
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
if (!XLogIsNeeded())
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif
if (close(openLogFile) != 0)
{
char xlogfname[MAXFNAMELEN];
int save_errno = errno;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
errno = save_errno;
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", xlogfname)));
}
openLogFile = -1;
Account explicitly for long-lived FDs that are allocated outside fd.c. The comments in fd.c have long claimed that all file allocations should go through that module, but in reality that's not always practical. fd.c doesn't supply APIs for invoking some FD-producing syscalls like pipe() or epoll_create(); and the APIs it does supply for non-virtual FDs are mostly insistent on releasing those FDs at transaction end; and in some cases the actual open() call is in code that can't be made to use fd.c, such as libpq. This has led to a situation where, in a modern server, there are likely to be seven or so long-lived FDs per backend process that are not known to fd.c. Since NUM_RESERVED_FDS is only 10, that meant we had *very* few spare FDs if max_files_per_process is >= the system ulimit and fd.c had opened all the files it thought it safely could. The contrib/postgres_fdw regression test, in particular, could easily be made to fall over by running it under a restrictive ulimit. To improve matters, invent functions Acquire/Reserve/ReleaseExternalFD that allow outside callers to tell fd.c that they have or want to allocate a FD that's not directly managed by fd.c. Add calls to track all the fixed FDs in a standard backend session, so that we are honestly guaranteeing that NUM_RESERVED_FDS FDs remain unused below the EMFILE limit in a backend's idle state. The coding rules for these functions say that there's no need to call them in code that just allocates one FD over a fairly short interval; we can dip into NUM_RESERVED_FDS for such cases. That means that there aren't all that many places where we need to worry. But postgres_fdw and dblink must use this facility to account for long-lived FDs consumed by libpq connections. There may be other places where it's worth doing such accounting, too, but this seems like enough to solve the immediate problem. Internally to fd.c, "external" FDs are limited to max_safe_fds/3 FDs. (Callers can choose to ignore this limit, but of course it's unwise to do so except for fixed file allocations.) I also reduced the limit on "allocated" files to max_safe_fds/3 FDs (it had been max_safe_fds/2). Conceivably a smarter rule could be used here --- but in practice, on reasonable systems, max_safe_fds should be large enough that this isn't much of an issue, so KISS for now. To avoid possible regression in the number of external or allocated files that can be opened, increase FD_MINFREE and the lower limit on max_files_per_process a little bit; we now insist that the effective "ulimit -n" be at least 64. This seems like pretty clearly a bug fix, but in view of the lack of field complaints, I'll refrain from risking a back-patch. Discussion: https://postgr.es/m/E1izCmM-0005pV-Co@gemulon.postgresql.org
2020-02-24 23:28:33 +01:00
ReleaseExternalFD();
}
/*
* Preallocate log files beyond the specified log endpoint.
*
* XXX this is currently extremely conservative, since it forces only one
* future log segment to exist, and even that only if we are 75% done with
* the current one. This is only appropriate for very low-WAL-volume systems.
* High-volume systems will be OK once they've built up a sufficient set of
* recycled log segments, but the startup transient is likely to include
* a lot of segment creations by foreground processes, which is not so good.
*
* XLogFileInitInternal() can ereport(ERROR). All known causes indicate big
* trouble; for example, a full filesystem is one cause. The checkpoint WAL
* and/or ControlFile updates already completed. If a RequestCheckpoint()
* initiated the present checkpoint and an ERROR ends this function, the
* command that called RequestCheckpoint() fails. That's not ideal, but it's
* not worth contorting more functions to use caller-specified elevel values.
* (With or without RequestCheckpoint(), an ERROR forestalls some inessential
* reporting and resource reclamation.)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
static void
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
{
XLogSegNo _logSegNo;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
int lf;
bool added;
char path[MAXPGPATH];
uint64 offset;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
if (!XLogCtl->InstallXLogFileSegmentActive)
return; /* unlocked check says no */
XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
if (offset >= (uint32) (0.75 * wal_segment_size))
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
{
_logSegNo++;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
if (lf >= 0)
close(lf);
if (added)
CheckpointStats.ckpt_segs_added++;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
}
}
/*
* Throws an error if the given log segment has already been removed or
* recycled. The caller should only pass a segment that it knows to have
* existed while the server has been running, as this function always
* succeeds if no WAL segments have been removed since startup.
* 'tli' is only used in the error message.
Clean up assorted messiness around AllocateDir() usage. This patch fixes a couple of low-probability bugs that could lead to reporting an irrelevant errno value (and hence possibly a wrong SQLSTATE) concerning directory-open or file-open failures. It also fixes places where we took shortcuts in reporting such errors, either by using elog instead of ereport or by using ereport but forgetting to specify an errcode. And it eliminates a lot of just plain redundant error-handling code. In service of all this, export fd.c's formerly-static function ReadDirExtended, so that external callers can make use of the coding pattern dir = AllocateDir(path); while ((de = ReadDirExtended(dir, path, LOG)) != NULL) if they'd like to treat directory-open failures as mere LOG conditions rather than errors. Also fix FreeDir to be a no-op if we reach it with dir == NULL, as such a coding pattern would cause. Then, remove code at many call sites that was throwing an error or log message for AllocateDir failure, as ReadDir or ReadDirExtended can handle that job just fine. Aside from being a net code savings, this gets rid of a lot of not-quite-up-to-snuff reports, as mentioned above. (In some places these changes result in replacing a custom error message such as "could not open tablespace directory" with more generic wording "could not open directory", but it was agreed that the custom wording buys little as long as we report the directory name.) In some other call sites where we can't just remove code, change the error reports to be fully project-style-compliant. Also reorder code in restoreTwoPhaseData that was acquiring a lock between AllocateDir and ReadDir; in the unlikely but surely not impossible case that LWLockAcquire changes errno, AllocateDir failures would be misreported. There is no great value in opening the directory before acquiring TwoPhaseStateLock, so just do it in the other order. Also fix CheckXLogRemoved to guarantee that it preserves errno, as quite a number of call sites are implicitly assuming. (Again, it's unlikely but I think not impossible that errno could change during a SpinLockAcquire. If so, this function was broken for its own purposes as well as breaking callers.) And change a few places that were using not-per-project-style messages, such as "could not read directory" when "could not open directory" is more correct. Back-patch the exporting of ReadDirExtended, in case we have occasion to back-patch some fix that makes use of it; it's not needed right now but surely making it global is pretty harmless. Also back-patch the restoreTwoPhaseData and CheckXLogRemoved fixes. The rest of this is essentially cosmetic and need not get back-patched. Michael Paquier, with a bit of additional work by me Discussion: https://postgr.es/m/CAB7nPqRpOCxjiirHmebEFhXVTK7V5Jvw4bz82p7Oimtsm3TyZA@mail.gmail.com
2017-12-04 23:02:52 +01:00
*
* Note: this function guarantees to keep errno unchanged on return.
* This supports callers that use this to possibly deliver a better
* error message about a missing file, while still being able to throw
* a normal file-access error afterwards, if this does return.
*/
void
CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
{
Clean up assorted messiness around AllocateDir() usage. This patch fixes a couple of low-probability bugs that could lead to reporting an irrelevant errno value (and hence possibly a wrong SQLSTATE) concerning directory-open or file-open failures. It also fixes places where we took shortcuts in reporting such errors, either by using elog instead of ereport or by using ereport but forgetting to specify an errcode. And it eliminates a lot of just plain redundant error-handling code. In service of all this, export fd.c's formerly-static function ReadDirExtended, so that external callers can make use of the coding pattern dir = AllocateDir(path); while ((de = ReadDirExtended(dir, path, LOG)) != NULL) if they'd like to treat directory-open failures as mere LOG conditions rather than errors. Also fix FreeDir to be a no-op if we reach it with dir == NULL, as such a coding pattern would cause. Then, remove code at many call sites that was throwing an error or log message for AllocateDir failure, as ReadDir or ReadDirExtended can handle that job just fine. Aside from being a net code savings, this gets rid of a lot of not-quite-up-to-snuff reports, as mentioned above. (In some places these changes result in replacing a custom error message such as "could not open tablespace directory" with more generic wording "could not open directory", but it was agreed that the custom wording buys little as long as we report the directory name.) In some other call sites where we can't just remove code, change the error reports to be fully project-style-compliant. Also reorder code in restoreTwoPhaseData that was acquiring a lock between AllocateDir and ReadDir; in the unlikely but surely not impossible case that LWLockAcquire changes errno, AllocateDir failures would be misreported. There is no great value in opening the directory before acquiring TwoPhaseStateLock, so just do it in the other order. Also fix CheckXLogRemoved to guarantee that it preserves errno, as quite a number of call sites are implicitly assuming. (Again, it's unlikely but I think not impossible that errno could change during a SpinLockAcquire. If so, this function was broken for its own purposes as well as breaking callers.) And change a few places that were using not-per-project-style messages, such as "could not read directory" when "could not open directory" is more correct. Back-patch the exporting of ReadDirExtended, in case we have occasion to back-patch some fix that makes use of it; it's not needed right now but surely making it global is pretty harmless. Also back-patch the restoreTwoPhaseData and CheckXLogRemoved fixes. The rest of this is essentially cosmetic and need not get back-patched. Michael Paquier, with a bit of additional work by me Discussion: https://postgr.es/m/CAB7nPqRpOCxjiirHmebEFhXVTK7V5Jvw4bz82p7Oimtsm3TyZA@mail.gmail.com
2017-12-04 23:02:52 +01:00
int save_errno = errno;
XLogSegNo lastRemovedSegNo;
SpinLockAcquire(&XLogCtl->info_lck);
lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
SpinLockRelease(&XLogCtl->info_lck);
if (segno <= lastRemovedSegNo)
{
char filename[MAXFNAMELEN];
XLogFileName(filename, tli, segno, wal_segment_size);
Clean up assorted messiness around AllocateDir() usage. This patch fixes a couple of low-probability bugs that could lead to reporting an irrelevant errno value (and hence possibly a wrong SQLSTATE) concerning directory-open or file-open failures. It also fixes places where we took shortcuts in reporting such errors, either by using elog instead of ereport or by using ereport but forgetting to specify an errcode. And it eliminates a lot of just plain redundant error-handling code. In service of all this, export fd.c's formerly-static function ReadDirExtended, so that external callers can make use of the coding pattern dir = AllocateDir(path); while ((de = ReadDirExtended(dir, path, LOG)) != NULL) if they'd like to treat directory-open failures as mere LOG conditions rather than errors. Also fix FreeDir to be a no-op if we reach it with dir == NULL, as such a coding pattern would cause. Then, remove code at many call sites that was throwing an error or log message for AllocateDir failure, as ReadDir or ReadDirExtended can handle that job just fine. Aside from being a net code savings, this gets rid of a lot of not-quite-up-to-snuff reports, as mentioned above. (In some places these changes result in replacing a custom error message such as "could not open tablespace directory" with more generic wording "could not open directory", but it was agreed that the custom wording buys little as long as we report the directory name.) In some other call sites where we can't just remove code, change the error reports to be fully project-style-compliant. Also reorder code in restoreTwoPhaseData that was acquiring a lock between AllocateDir and ReadDir; in the unlikely but surely not impossible case that LWLockAcquire changes errno, AllocateDir failures would be misreported. There is no great value in opening the directory before acquiring TwoPhaseStateLock, so just do it in the other order. Also fix CheckXLogRemoved to guarantee that it preserves errno, as quite a number of call sites are implicitly assuming. (Again, it's unlikely but I think not impossible that errno could change during a SpinLockAcquire. If so, this function was broken for its own purposes as well as breaking callers.) And change a few places that were using not-per-project-style messages, such as "could not read directory" when "could not open directory" is more correct. Back-patch the exporting of ReadDirExtended, in case we have occasion to back-patch some fix that makes use of it; it's not needed right now but surely making it global is pretty harmless. Also back-patch the restoreTwoPhaseData and CheckXLogRemoved fixes. The rest of this is essentially cosmetic and need not get back-patched. Michael Paquier, with a bit of additional work by me Discussion: https://postgr.es/m/CAB7nPqRpOCxjiirHmebEFhXVTK7V5Jvw4bz82p7Oimtsm3TyZA@mail.gmail.com
2017-12-04 23:02:52 +01:00
errno = save_errno;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("requested WAL segment %s has already been removed",
filename)));
}
Clean up assorted messiness around AllocateDir() usage. This patch fixes a couple of low-probability bugs that could lead to reporting an irrelevant errno value (and hence possibly a wrong SQLSTATE) concerning directory-open or file-open failures. It also fixes places where we took shortcuts in reporting such errors, either by using elog instead of ereport or by using ereport but forgetting to specify an errcode. And it eliminates a lot of just plain redundant error-handling code. In service of all this, export fd.c's formerly-static function ReadDirExtended, so that external callers can make use of the coding pattern dir = AllocateDir(path); while ((de = ReadDirExtended(dir, path, LOG)) != NULL) if they'd like to treat directory-open failures as mere LOG conditions rather than errors. Also fix FreeDir to be a no-op if we reach it with dir == NULL, as such a coding pattern would cause. Then, remove code at many call sites that was throwing an error or log message for AllocateDir failure, as ReadDir or ReadDirExtended can handle that job just fine. Aside from being a net code savings, this gets rid of a lot of not-quite-up-to-snuff reports, as mentioned above. (In some places these changes result in replacing a custom error message such as "could not open tablespace directory" with more generic wording "could not open directory", but it was agreed that the custom wording buys little as long as we report the directory name.) In some other call sites where we can't just remove code, change the error reports to be fully project-style-compliant. Also reorder code in restoreTwoPhaseData that was acquiring a lock between AllocateDir and ReadDir; in the unlikely but surely not impossible case that LWLockAcquire changes errno, AllocateDir failures would be misreported. There is no great value in opening the directory before acquiring TwoPhaseStateLock, so just do it in the other order. Also fix CheckXLogRemoved to guarantee that it preserves errno, as quite a number of call sites are implicitly assuming. (Again, it's unlikely but I think not impossible that errno could change during a SpinLockAcquire. If so, this function was broken for its own purposes as well as breaking callers.) And change a few places that were using not-per-project-style messages, such as "could not read directory" when "could not open directory" is more correct. Back-patch the exporting of ReadDirExtended, in case we have occasion to back-patch some fix that makes use of it; it's not needed right now but surely making it global is pretty harmless. Also back-patch the restoreTwoPhaseData and CheckXLogRemoved fixes. The rest of this is essentially cosmetic and need not get back-patched. Michael Paquier, with a bit of additional work by me Discussion: https://postgr.es/m/CAB7nPqRpOCxjiirHmebEFhXVTK7V5Jvw4bz82p7Oimtsm3TyZA@mail.gmail.com
2017-12-04 23:02:52 +01:00
errno = save_errno;
}
/*
* Return the last WAL segment removed, or 0 if no segment has been removed
* since startup.
*
* NB: the result can be out of date arbitrarily fast, the caller has to deal
* with that.
*/
XLogSegNo
XLogGetLastRemovedSegno(void)
{
XLogSegNo lastRemovedSegNo;
SpinLockAcquire(&XLogCtl->info_lck);
lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
SpinLockRelease(&XLogCtl->info_lck);
return lastRemovedSegNo;
}
/*
* Update the last removed segno pointer in shared memory, to reflect that the
* given XLOG file has been removed.
*/
static void
UpdateLastRemovedPtr(char *filename)
{
uint32 tli;
XLogSegNo segno;
XLogFromFileName(filename, &tli, &segno, wal_segment_size);
SpinLockAcquire(&XLogCtl->info_lck);
if (segno > XLogCtl->lastRemovedSegNo)
XLogCtl->lastRemovedSegNo = segno;
SpinLockRelease(&XLogCtl->info_lck);
}
/*
* Remove all temporary log files in pg_wal
*
* This is called at the beginning of recovery after a previous crash,
* at a point where no other processes write fresh WAL data.
*/
static void
RemoveTempXlogFiles(void)
{
DIR *xldir;
struct dirent *xlde;
elog(DEBUG2, "removing all temporary WAL segments");
xldir = AllocateDir(XLOGDIR);
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
{
char path[MAXPGPATH];
if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
continue;
snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
unlink(path);
elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
}
FreeDir(xldir);
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Recycle or remove all log files older or equal to passed segno.
*
* endptr is current (or recent) end of xlog, and lastredoptr is the
* redo pointer of the last checkpoint. These are used to determine
* whether we want to recycle rather than delete no-longer-wanted log files.
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
*
* insertTLI is the current timeline for XLOG insertion. Any recycled
* segments should be reused for this timeline.
*/
static void
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
TimeLineID insertTLI)
{
DIR *xldir;
struct dirent *xlde;
char lastoff[MAXFNAMELEN];
XLogSegNo endlogSegNo;
XLogSegNo recycleSegNo;
/* Initialize info about where to try to recycle to */
XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
recycleSegNo = XLOGfileslop(lastredoptr);
/*
* Construct a filename of the last segment to be kept. The timeline ID
* doesn't matter, we ignore that in the comparison. (During recovery,
* InsertTimeLineID isn't set, so we can't use that.)
*/
XLogFileName(lastoff, 0, segno, wal_segment_size);
elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
lastoff);
Clean up assorted messiness around AllocateDir() usage. This patch fixes a couple of low-probability bugs that could lead to reporting an irrelevant errno value (and hence possibly a wrong SQLSTATE) concerning directory-open or file-open failures. It also fixes places where we took shortcuts in reporting such errors, either by using elog instead of ereport or by using ereport but forgetting to specify an errcode. And it eliminates a lot of just plain redundant error-handling code. In service of all this, export fd.c's formerly-static function ReadDirExtended, so that external callers can make use of the coding pattern dir = AllocateDir(path); while ((de = ReadDirExtended(dir, path, LOG)) != NULL) if they'd like to treat directory-open failures as mere LOG conditions rather than errors. Also fix FreeDir to be a no-op if we reach it with dir == NULL, as such a coding pattern would cause. Then, remove code at many call sites that was throwing an error or log message for AllocateDir failure, as ReadDir or ReadDirExtended can handle that job just fine. Aside from being a net code savings, this gets rid of a lot of not-quite-up-to-snuff reports, as mentioned above. (In some places these changes result in replacing a custom error message such as "could not open tablespace directory" with more generic wording "could not open directory", but it was agreed that the custom wording buys little as long as we report the directory name.) In some other call sites where we can't just remove code, change the error reports to be fully project-style-compliant. Also reorder code in restoreTwoPhaseData that was acquiring a lock between AllocateDir and ReadDir; in the unlikely but surely not impossible case that LWLockAcquire changes errno, AllocateDir failures would be misreported. There is no great value in opening the directory before acquiring TwoPhaseStateLock, so just do it in the other order. Also fix CheckXLogRemoved to guarantee that it preserves errno, as quite a number of call sites are implicitly assuming. (Again, it's unlikely but I think not impossible that errno could change during a SpinLockAcquire. If so, this function was broken for its own purposes as well as breaking callers.) And change a few places that were using not-per-project-style messages, such as "could not read directory" when "could not open directory" is more correct. Back-patch the exporting of ReadDirExtended, in case we have occasion to back-patch some fix that makes use of it; it's not needed right now but surely making it global is pretty harmless. Also back-patch the restoreTwoPhaseData and CheckXLogRemoved fixes. The rest of this is essentially cosmetic and need not get back-patched. Michael Paquier, with a bit of additional work by me Discussion: https://postgr.es/m/CAB7nPqRpOCxjiirHmebEFhXVTK7V5Jvw4bz82p7Oimtsm3TyZA@mail.gmail.com
2017-12-04 23:02:52 +01:00
xldir = AllocateDir(XLOGDIR);
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
{
/* Ignore files that are not XLOG segments */
if (!IsXLogFileName(xlde->d_name) &&
!IsPartialXLogFileName(xlde->d_name))
continue;
/*
* We ignore the timeline part of the XLOG segment identifiers in
* deciding whether a segment is still needed. This ensures that we
* won't prematurely remove a segment from a parent timeline. We could
* probably be a little more proactive about removing segments of
* non-parent timelines, but that would be a whole lot more
* complicated.
*
* We use the alphanumeric sorting property of the filenames to decide
* which ones are earlier than the lastoff segment.
*/
if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
{
if (XLogArchiveCheckDone(xlde->d_name))
{
/* Update the last removed location in shared memory first */
UpdateLastRemovedPtr(xlde->d_name);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
RemoveXlogFile(xlde->d_name, recycleSegNo, &endlogSegNo,
insertTLI);
}
}
}
FreeDir(xldir);
}
/*
* Remove WAL files that are not part of the given timeline's history.
*
* This is called during recovery, whenever we switch to follow a new
* timeline, and at the end of recovery when we create a new timeline. We
* wouldn't otherwise care about extra WAL files lying in pg_wal, but they
* might be leftover pre-allocated or recycled WAL segments on the old timeline
* that we haven't used yet, and contain garbage. If we just leave them in
* pg_wal, they will eventually be archived, and we can't let that happen.
* Files that belong to our timeline history are valid, because we have
* successfully replayed them, but from others we can't be sure.
*
* 'switchpoint' is the current point in WAL where we switch to new timeline,
* and 'newTLI' is the new timeline we switch to.
*/
void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
{
DIR *xldir;
struct dirent *xlde;
char switchseg[MAXFNAMELEN];
XLogSegNo endLogSegNo;
XLogSegNo switchLogSegNo;
XLogSegNo recycleSegNo;
/*
* Initialize info about where to begin the work. This will recycle,
* somewhat arbitrarily, 10 future segments.
*/
XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
recycleSegNo = endLogSegNo + 10;
/*
* Construct a filename of the last segment to be kept.
*/
XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
switchseg);
Clean up assorted messiness around AllocateDir() usage. This patch fixes a couple of low-probability bugs that could lead to reporting an irrelevant errno value (and hence possibly a wrong SQLSTATE) concerning directory-open or file-open failures. It also fixes places where we took shortcuts in reporting such errors, either by using elog instead of ereport or by using ereport but forgetting to specify an errcode. And it eliminates a lot of just plain redundant error-handling code. In service of all this, export fd.c's formerly-static function ReadDirExtended, so that external callers can make use of the coding pattern dir = AllocateDir(path); while ((de = ReadDirExtended(dir, path, LOG)) != NULL) if they'd like to treat directory-open failures as mere LOG conditions rather than errors. Also fix FreeDir to be a no-op if we reach it with dir == NULL, as such a coding pattern would cause. Then, remove code at many call sites that was throwing an error or log message for AllocateDir failure, as ReadDir or ReadDirExtended can handle that job just fine. Aside from being a net code savings, this gets rid of a lot of not-quite-up-to-snuff reports, as mentioned above. (In some places these changes result in replacing a custom error message such as "could not open tablespace directory" with more generic wording "could not open directory", but it was agreed that the custom wording buys little as long as we report the directory name.) In some other call sites where we can't just remove code, change the error reports to be fully project-style-compliant. Also reorder code in restoreTwoPhaseData that was acquiring a lock between AllocateDir and ReadDir; in the unlikely but surely not impossible case that LWLockAcquire changes errno, AllocateDir failures would be misreported. There is no great value in opening the directory before acquiring TwoPhaseStateLock, so just do it in the other order. Also fix CheckXLogRemoved to guarantee that it preserves errno, as quite a number of call sites are implicitly assuming. (Again, it's unlikely but I think not impossible that errno could change during a SpinLockAcquire. If so, this function was broken for its own purposes as well as breaking callers.) And change a few places that were using not-per-project-style messages, such as "could not read directory" when "could not open directory" is more correct. Back-patch the exporting of ReadDirExtended, in case we have occasion to back-patch some fix that makes use of it; it's not needed right now but surely making it global is pretty harmless. Also back-patch the restoreTwoPhaseData and CheckXLogRemoved fixes. The rest of this is essentially cosmetic and need not get back-patched. Michael Paquier, with a bit of additional work by me Discussion: https://postgr.es/m/CAB7nPqRpOCxjiirHmebEFhXVTK7V5Jvw4bz82p7Oimtsm3TyZA@mail.gmail.com
2017-12-04 23:02:52 +01:00
xldir = AllocateDir(XLOGDIR);
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
{
/* Ignore files that are not XLOG segments */
if (!IsXLogFileName(xlde->d_name))
continue;
/*
* Remove files that are on a timeline older than the new one we're
* switching to, but with a segment number >= the first segment on the
* new timeline.
*/
if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
strcmp(xlde->d_name + 8, switchseg + 8) > 0)
{
/*
* If the file has already been marked as .ready, however, don't
* remove it yet. It should be OK to remove it - files that are
* not part of our timeline history are not required for recovery
* - but seems safer to let them be archived and removed later.
*/
if (!XLogArchiveIsReady(xlde->d_name))
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
RemoveXlogFile(xlde->d_name, recycleSegNo, &endLogSegNo,
newTLI);
}
}
FreeDir(xldir);
}
/*
* Recycle or remove a log file that's no longer needed.
*
* segname is the name of the segment to recycle or remove. recycleSegNo
* is the segment number to recycle up to. endlogSegNo is the segment
* number of the current (or recent) end of WAL.
*
* endlogSegNo gets incremented if the segment is recycled so as it is not
* checked again with future callers of this function.
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
*
* insertTLI is the current timeline for XLOG insertion. Any recycled segments
* should be used for this timeline.
*/
static void
RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogSegNo *endlogSegNo, TimeLineID insertTLI)
{
char path[MAXPGPATH];
#ifdef WIN32
char newpath[MAXPGPATH];
#endif
struct stat statbuf;
snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
/*
* Before deleting the file, see if it can be recycled as a future log
* segment. Only recycle normal files, because we don't want to recycle
* symbolic links pointing to a separate archive directory.
*/
if (wal_recycle &&
*endlogSegNo <= recycleSegNo &&
XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
InstallXLogFileSegment(endlogSegNo, path,
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
true, recycleSegNo, insertTLI))
{
ereport(DEBUG2,
(errmsg_internal("recycled write-ahead log file \"%s\"",
segname)));
CheckpointStats.ckpt_segs_recycled++;
/* Needn't recheck that slot on future iterations */
(*endlogSegNo)++;
}
else
{
/* No need for any more future segments, or recycling failed ... */
int rc;
ereport(DEBUG2,
(errmsg_internal("removing write-ahead log file \"%s\"",
segname)));
#ifdef WIN32
2015-05-24 03:35:49 +02:00
/*
* On Windows, if another process (e.g another backend) holds the file
* open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
* will still show up in directory listing until the last handle is
* closed. To avoid confusing the lingering deleted file for a live
* WAL file that needs to be archived, rename it before deleting it.
*
* If another process holds the file open without FILE_SHARE_DELETE
* flag, rename will fail. We'll try again at the next checkpoint.
*/
snprintf(newpath, MAXPGPATH, "%s.deleted", path);
if (rename(path, newpath) != 0)
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not rename file \"%s\": %m",
path)));
return;
}
rc = durable_unlink(newpath, LOG);
#else
rc = durable_unlink(path, LOG);
#endif
if (rc != 0)
{
/* Message already logged by durable_unlink() */
return;
}
CheckpointStats.ckpt_segs_removed++;
}
2004-08-29 07:07:03 +02:00
XLogArchiveCleanup(segname);
}
/*
* Verify whether pg_wal and pg_wal/archive_status exist.
* If the latter does not exist, recreate it.
*
* It is not the goal of this function to verify the contents of these
* directories, but to help in cases where someone has performed a cluster
* copy for PITR purposes but omitted pg_wal from the copy.
*
* We could also recreate pg_wal if it doesn't exist, but a deliberate
* policy decision was made not to. It is fairly common for pg_wal to be
* a symlink, and if that was the DBA's intent then automatically making a
* plain directory would result in degraded performance with no notice.
*/
static void
ValidateXLOGDirectoryStructure(void)
{
char path[MAXPGPATH];
struct stat stat_buf;
/* Check for pg_wal; if it doesn't exist, error out */
if (stat(XLOGDIR, &stat_buf) != 0 ||
!S_ISDIR(stat_buf.st_mode))
ereport(FATAL,
(errmsg("required WAL directory \"%s\" does not exist",
XLOGDIR)));
/* Check for archive_status */
snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
if (stat(path, &stat_buf) == 0)
{
/* Check for weird cases where it exists but isn't a directory */
if (!S_ISDIR(stat_buf.st_mode))
ereport(FATAL,
(errmsg("required WAL directory \"%s\" does not exist",
path)));
}
else
{
ereport(LOG,
(errmsg("creating missing WAL directory \"%s\"", path)));
if (MakePGDirectory(path) < 0)
ereport(FATAL,
(errmsg("could not create missing directory \"%s\": %m",
path)));
}
}
/*
* Remove previous backup history files. This also retries creation of
* .ready files for any backup history files for which XLogArchiveNotify
* failed earlier.
*/
static void
CleanupBackupHistory(void)
{
DIR *xldir;
struct dirent *xlde;
char path[MAXPGPATH + sizeof(XLOGDIR)];
xldir = AllocateDir(XLOGDIR);
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
{
if (IsBackupHistoryFileName(xlde->d_name))
{
if (XLogArchiveCheckDone(xlde->d_name))
{
elog(DEBUG2, "removing WAL backup history file \"%s\"",
xlde->d_name);
snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
unlink(path);
XLogArchiveCleanup(xlde->d_name);
}
}
}
FreeDir(xldir);
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* I/O routines for pg_control
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*
* *ControlFile is a buffer in shared memory that holds an image of the
* contents of pg_control. WriteControlFile() initializes pg_control
* given a preloaded buffer, ReadControlFile() loads the buffer from
* the pg_control file (during postmaster or standalone-backend startup),
* and UpdateControlFile() rewrites pg_control after we modify xlog state.
* InitControlFile() fills the buffer with initial values.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*
* For simplicity, WriteControlFile() initializes the fields of pg_control
* that are related to checking backend/database compatibility, and
* ReadControlFile() verifies they are correct. We could split out the
* I/O and compatibility-check functions, but there seems no need currently.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
static void
InitControlFile(uint64 sysidentifier)
{
char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
/*
* Generate a random nonce. This is used for authentication requests that
* will fail because the user does not exist. The nonce is used to create
* a genuine-looking password challenge for the non-existent user, in lieu
* of an actual stored password.
*/
if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
ereport(PANIC,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("could not generate secret authorization token")));
memset(ControlFile, 0, sizeof(ControlFileData));
/* Initialize pg_control status fields */
ControlFile->system_identifier = sysidentifier;
memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
ControlFile->state = DB_SHUTDOWNED;
ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
/* Set important parameter values for use when replaying WAL */
ControlFile->MaxConnections = MaxConnections;
ControlFile->max_worker_processes = max_worker_processes;
ControlFile->max_wal_senders = max_wal_senders;
ControlFile->max_prepared_xacts = max_prepared_xacts;
ControlFile->max_locks_per_xact = max_locks_per_xact;
ControlFile->wal_level = wal_level;
ControlFile->wal_log_hints = wal_log_hints;
ControlFile->track_commit_timestamp = track_commit_timestamp;
ControlFile->data_checksum_version = bootstrap_data_checksum_version;
}
static void
WriteControlFile(void)
{
int fd;
char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */
/*
* Ensure that the size of the pg_control data structure is sane. See the
* comments for these symbols in pg_control.h.
*/
StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
"pg_control is too large for atomic disk writes");
StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
"sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
/*
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
* Initialize version and compatibility-check fields
*/
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
ControlFile->pg_control_version = PG_CONTROL_VERSION;
ControlFile->catalog_version_no = CATALOG_VERSION_NO;
ControlFile->maxAlign = MAXIMUM_ALIGNOF;
ControlFile->floatFormat = FLOATFORMAT_VALUE;
ControlFile->blcksz = BLCKSZ;
ControlFile->relseg_size = RELSEG_SIZE;
ControlFile->xlog_blcksz = XLOG_BLCKSZ;
ControlFile->xlog_seg_size = wal_segment_size;
ControlFile->nameDataLen = NAMEDATALEN;
ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
ControlFile->loblksize = LOBLKSIZE;
ControlFile->float8ByVal = FLOAT8PASSBYVAL;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/* Contents are protected with a CRC */
INIT_CRC32C(ControlFile->crc);
COMP_CRC32C(ControlFile->crc,
(char *) ControlFile,
offsetof(ControlFileData, crc));
FIN_CRC32C(ControlFile->crc);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
* the excess over sizeof(ControlFileData). This reduces the odds of
* premature-EOF errors when reading pg_control. We'll still fail when we
* check the contents of the file, but hopefully with a more specific
* error than "couldn't read pg_control".
*/
memset(buffer, 0, PG_CONTROL_FILE_SIZE);
memcpy(buffer, ControlFile, sizeof(ControlFileData));
fd = BasicOpenFile(XLOG_CONTROL_FILE,
O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
if (fd < 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m",
XLOG_CONTROL_FILE)));
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)
errno = ENOSPC;
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m",
XLOG_CONTROL_FILE)));
}
pgstat_report_wait_end();
pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
if (pg_fsync(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
XLOG_CONTROL_FILE)));
pgstat_report_wait_end();
if (close(fd) != 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m",
XLOG_CONTROL_FILE)));
}
static void
ReadControlFile(void)
{
pg_crc32c crc;
int fd;
static char wal_segsz_str[20];
int r;
/*
* Read data...
*/
fd = BasicOpenFile(XLOG_CONTROL_FILE,
O_RDWR | PG_BINARY);
if (fd < 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m",
XLOG_CONTROL_FILE)));
pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
r = read(fd, ControlFile, sizeof(ControlFileData));
if (r != sizeof(ControlFileData))
{
if (r < 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
XLOG_CONTROL_FILE)));
else
ereport(PANIC,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("could not read file \"%s\": read %d of %zu",
XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
}
pgstat_report_wait_end();
close(fd);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Check for expected pg_control format version. If this is wrong, the
* CRC check will likely fail because we'll be checking the wrong number
* of bytes. Complaining about wrong version will probably be more
* enlightening than complaining about wrong CRC.
*/
if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
" but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
ControlFile->pg_control_version, ControlFile->pg_control_version,
PG_CONTROL_VERSION, PG_CONTROL_VERSION),
errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb.")));
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
" but the server was compiled with PG_CONTROL_VERSION %d.",
ControlFile->pg_control_version, PG_CONTROL_VERSION),
errhint("It looks like you need to initdb.")));
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/* Now check the CRC. */
INIT_CRC32C(crc);
COMP_CRC32C(crc,
(char *) ControlFile,
offsetof(ControlFileData, crc));
FIN_CRC32C(crc);
if (!EQ_CRC32C(crc, ControlFile->crc))
ereport(FATAL,
(errmsg("incorrect checksum in control file")));
/*
2009-02-07 11:49:36 +01:00
* Do compatibility checking immediately. If the database isn't
* compatible with the backend executable, we want to abort before we can
* possibly do any damage.
*/
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
" but the server was compiled with CATALOG_VERSION_NO %d.",
ControlFile->catalog_version_no, CATALOG_VERSION_NO),
errhint("It looks like you need to initdb.")));
if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with MAXALIGN %d,"
" but the server was compiled with MAXALIGN %d.",
ControlFile->maxAlign, MAXIMUM_ALIGNOF),
errhint("It looks like you need to initdb.")));
if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
2005-10-29 02:31:52 +02:00
errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
errhint("It looks like you need to initdb.")));
if (ControlFile->blcksz != BLCKSZ)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with BLCKSZ %d,"
" but the server was compiled with BLCKSZ %d.",
ControlFile->blcksz, BLCKSZ),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->relseg_size != RELSEG_SIZE)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
" but the server was compiled with RELSEG_SIZE %d.",
ControlFile->relseg_size, RELSEG_SIZE),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
" but the server was compiled with XLOG_BLCKSZ %d.",
ControlFile->xlog_blcksz, XLOG_BLCKSZ),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->nameDataLen != NAMEDATALEN)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with NAMEDATALEN %d,"
" but the server was compiled with NAMEDATALEN %d.",
ControlFile->nameDataLen, NAMEDATALEN),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
" but the server was compiled with INDEX_MAX_KEYS %d.",
ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
errhint("It looks like you need to recompile or initdb.")));
if (ControlFile->loblksize != LOBLKSIZE)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with LOBLKSIZE %d,"
" but the server was compiled with LOBLKSIZE %d.",
ControlFile->loblksize, (int) LOBLKSIZE),
errhint("It looks like you need to recompile or initdb.")));
#ifdef USE_FLOAT8_BYVAL
if (ControlFile->float8ByVal != true)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
" but the server was compiled with USE_FLOAT8_BYVAL."),
errhint("It looks like you need to recompile or initdb.")));
#else
if (ControlFile->float8ByVal != false)
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
" but the server was compiled without USE_FLOAT8_BYVAL."),
errhint("It looks like you need to recompile or initdb.")));
#endif
wal_segment_size = ControlFile->xlog_seg_size;
if (!IsValidWalSegSize(wal_segment_size))
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
"WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
wal_segment_size,
wal_segment_size)));
snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
PGC_S_OVERRIDE);
/* check and update variables dependent on wal_segment_size */
if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));
if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));
UsableBytesInSegment =
(wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
CalculateCheckpointSegments();
/* Make the initdb settings visible as GUC variables, too */
SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
PGC_INTERNAL, PGC_S_OVERRIDE);
}
/*
* Utility wrapper to update the control file. Note that the control
* file gets flushed.
*/
static void
UpdateControlFile(void)
{
Unified logging system for command-line programs This unifies the various ad hoc logging (message printing, error printing) systems used throughout the command-line programs. Features: - Program name is automatically prefixed. - Message string does not end with newline. This removes a common source of inconsistencies and omissions. - Additionally, a final newline is automatically stripped, simplifying use of PQerrorMessage() etc., another common source of mistakes. - I converted error message strings to use %m where possible. - As a result of the above several points, more translatable message strings can be shared between different components and between frontends and backend, without gratuitous punctuation or whitespace differences. - There is support for setting a "log level". This is not meant to be user-facing, but can be used internally to implement debug or verbose modes. - Lazy argument evaluation, so no significant overhead if logging at some level is disabled. - Some color in the messages, similar to gcc and clang. Set PG_COLOR=auto to try it out. Some colors are predefined, but can be customized by setting PG_COLORS. - Common files (common/, fe_utils/, etc.) can handle logging much more simply by just using one API without worrying too much about the context of the calling program, requiring callbacks, or having to pass "progname" around everywhere. - Some programs called setvbuf() to make sure that stderr is unbuffered, even on Windows. But not all programs did that. This is now done centrally. Soft goals: - Reduces vertical space use and visual complexity of error reporting in the source code. - Encourages more deliberate classification of messages. For example, in some cases it wasn't clear without analyzing the surrounding code whether a message was meant as an error or just an info. - Concepts and terms are vaguely aligned with popular logging frameworks such as log4j and Python logging. This is all just about printing stuff out. Nothing affects program flow (e.g., fatal exits). The uses are just too varied to do that. Some existing code had wrappers that do some kind of print-and-exit, and I adapted those. I tried to keep the output mostly the same, but there is a lot of historical baggage to unwind and special cases to consider, and I might not always have succeeded. One significant change is that pg_rewind used to write all error messages to stdout. That is now changed to stderr. Reviewed-by: Donald Dong <xdong@csumb.edu> Reviewed-by: Arthur Zakirov <a.zakirov@postgrespro.ru> Discussion: https://www.postgresql.org/message-id/flat/6a609b43-4f57-7348-6480-bd022f924310@2ndquadrant.com
2019-04-01 14:24:37 +02:00
update_controlfile(DataDir, ControlFile, true);
}
/*
* Returns the unique system identifier from control file.
*/
uint64
GetSystemIdentifier(void)
{
Assert(ControlFile != NULL);
return ControlFile->system_identifier;
}
Support SCRAM-SHA-256 authentication (RFC 5802 and 7677). This introduces a new generic SASL authentication method, similar to the GSS and SSPI methods. The server first tells the client which SASL authentication mechanism to use, and then the mechanism-specific SASL messages are exchanged in AuthenticationSASLcontinue and PasswordMessage messages. Only SCRAM-SHA-256 is supported at the moment, but this allows adding more SASL mechanisms in the future, without changing the overall protocol. Support for channel binding, aka SCRAM-SHA-256-PLUS is left for later. The SASLPrep algorithm, for pre-processing the password, is not yet implemented. That could cause trouble, if you use a password with non-ASCII characters, and a client library that does implement SASLprep. That will hopefully be added later. Authorization identities, as specified in the SCRAM-SHA-256 specification, are ignored. SET SESSION AUTHORIZATION provides more or less the same functionality, anyway. If a user doesn't exist, perform a "mock" authentication, by constructing an authentic-looking challenge on the fly. The challenge is derived from a new system-wide random value, "mock authentication nonce", which is created at initdb, and stored in the control file. We go through these motions, in order to not give away the information on whether the user exists, to unauthenticated users. Bumps PG_CONTROL_VERSION, because of the new field in control file. Patch by Michael Paquier and Heikki Linnakangas, reviewed at different stages by Robert Haas, Stephen Frost, David Steele, Aleksander Alekseev, and many others. Discussion: https://www.postgresql.org/message-id/CAB7nPqRbR3GmFYdedCAhzukfKrgBLTLtMvENOmPrVWREsZkF8g%40mail.gmail.com Discussion: https://www.postgresql.org/message-id/CAB7nPqSMXU35g%3DW9X74HVeQp0uvgJxvYOuA4A-A3M%2B0wfEBv-w%40mail.gmail.com Discussion: https://www.postgresql.org/message-id/55192AFE.6080106@iki.fi
2017-03-07 13:25:40 +01:00
/*
* Returns the random nonce from control file.
*/
char *
GetMockAuthenticationNonce(void)
{
Assert(ControlFile != NULL);
return ControlFile->mock_authentication_nonce;
}
/*
* Are checksums enabled for data pages?
*/
bool
DataChecksumsEnabled(void)
{
Assert(ControlFile != NULL);
return (ControlFile->data_checksum_version > 0);
}
/*
* Returns a fake LSN for unlogged relations.
*
* Each call generates an LSN that is greater than any previous value
* returned. The current counter value is saved and restored across clean
* shutdowns, but like unlogged relations, does not survive a crash. This can
* be used in lieu of real LSN values returned by XLogInsert, if you need an
* LSN-like increasing sequence of numbers without writing any WAL.
*/
XLogRecPtr
GetFakeLSNForUnloggedRel(void)
{
XLogRecPtr nextUnloggedLSN;
/* increment the unloggedLSN counter, need SpinLock */
SpinLockAcquire(&XLogCtl->ulsn_lck);
nextUnloggedLSN = XLogCtl->unloggedLSN++;
SpinLockRelease(&XLogCtl->ulsn_lck);
return nextUnloggedLSN;
}
/*
* Auto-tune the number of XLOG buffers.
*
* The preferred setting for wal_buffers is about 3% of shared_buffers, with
* a maximum of one XLOG segment (there is little reason to think that more
* is helpful, at least so long as we force an fsync when switching log files)
* and a minimum of 8 blocks (which was the default value prior to PostgreSQL
* 9.1, when auto-tuning was added).
*
* This should not be called until NBuffers has received its final value.
*/
static int
XLOGChooseNumBuffers(void)
{
int xbuffers;
xbuffers = NBuffers / 32;
if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
xbuffers = (wal_segment_size / XLOG_BLCKSZ);
if (xbuffers < 8)
xbuffers = 8;
return xbuffers;
}
/*
* GUC check_hook for wal_buffers
*/
bool
check_wal_buffers(int *newval, void **extra, GucSource source)
{
/*
* -1 indicates a request for auto-tune.
*/
if (*newval == -1)
{
/*
* If we haven't yet changed the boot_val default of -1, just let it
* be. We'll fix it when XLOGShmemSize is called.
*/
if (XLOGbuffers == -1)
return true;
/* Otherwise, substitute the auto-tune value */
*newval = XLOGChooseNumBuffers();
}
/*
* We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
* 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
* the case, we just silently treat such values as a request for the
* minimum. (We could throw an error instead, but that doesn't seem very
* helpful.)
*/
if (*newval < 4)
*newval = 4;
return true;
}
/*
* Read the control file, set respective GUCs.
*
* This is to be called during startup, including a crash recovery cycle,
* unless in bootstrap mode, where no control file yet exists. As there's no
* usable shared memory yet (its sizing can depend on the contents of the
2018-02-16 12:46:41 +01:00
* control file!), first store the contents in local memory. XLOGShmemInit()
* will then copy it to shared memory later.
*
* reset just controls whether previous contents are to be expected (in the
* reset case, there's a dangling pointer into old shared memory), or not.
*/
void
LocalProcessControlFile(bool reset)
{
Assert(reset || ControlFile == NULL);
ControlFile = palloc(sizeof(ControlFileData));
ReadControlFile();
}
/*
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
* Initialization of shared memory for XLOG
*/
Size
XLOGShmemSize(void)
{
Size size;
/*
* If the value of wal_buffers is -1, use the preferred auto-tune value.
* This isn't an amazingly clean place to do this, but we must wait till
* NBuffers has received its final value, and must do it before using the
* value of XLOGbuffers to do anything important.
*/
if (XLOGbuffers == -1)
{
char buf[32];
snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
}
Assert(XLOGbuffers > 0);
/* XLogCtl */
size = sizeof(XLogCtlData);
/* WAL insertion locks, plus alignment */
size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
/* xlblocks array */
size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
/* extra alignment padding for XLOG I/O buffers */
size = add_size(size, XLOG_BLCKSZ);
/* and the buffers themselves */
size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
/*
* Note: we don't count ControlFileData, it comes out of the "slop factor"
* added by CreateSharedMemoryAndSemaphores. This lets us use this
* routine again below to compute the actual allocation size.
*/
return size;
}
void
XLOGShmemInit(void)
{
bool foundCFile,
foundXLog;
char *allocptr;
int i;
ControlFileData *localControlFile;
#ifdef WAL_DEBUG
2015-05-24 03:35:49 +02:00
/*
* Create a memory context for WAL debugging that's exempt from the normal
* "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
* an allocation fails, but wal_debug is not for production use anyway.
*/
if (walDebugCxt == NULL)
{
walDebugCxt = AllocSetContextCreate(TopMemoryContext,
"WAL Debug",
Add macros to make AllocSetContextCreate() calls simpler and safer. I found that half a dozen (nearly 5%) of our AllocSetContextCreate calls had typos in the context-sizing parameters. While none of these led to especially significant problems, they did create minor inefficiencies, and it's now clear that expecting people to copy-and-paste those calls accurately is not a great idea. Let's reduce the risk of future errors by introducing single macros that encapsulate the common use-cases. Three such macros are enough to cover all but two special-purpose contexts; those two calls can be left as-is, I think. While this patch doesn't in itself improve matters for third-party extensions, it doesn't break anything for them either, and they can gradually adopt the simplified notation over time. In passing, change TopMemoryContext to use the default allocation parameters. Formerly it could only be extended 8K at a time. That was probably reasonable when this code was written; but nowadays we create many more contexts than we did then, so that it's not unusual to have a couple hundred K in TopMemoryContext, even without considering various dubious code that sticks other things there. There seems no good reason not to let it use growing blocks like most other contexts. Back-patch to 9.6, mostly because that's still close enough to HEAD that it's easy to do so, and keeping the branches in sync can be expected to avoid some future back-patching pain. The bugs fixed by these changes don't seem to be significant enough to justify fixing them further back. Discussion: <21072.1472321324@sss.pgh.pa.us>
2016-08-27 23:50:38 +02:00
ALLOCSET_DEFAULT_SIZES);
MemoryContextAllowInCriticalSection(walDebugCxt, true);
}
#endif
XLogCtl = (XLogCtlData *)
ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
localControlFile = ControlFile;
ControlFile = (ControlFileData *)
ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
if (foundCFile || foundXLog)
{
/* both should be present or neither */
Assert(foundCFile && foundXLog);
/* Initialize local copy of WALInsertLocks */
WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
if (localControlFile)
pfree(localControlFile);
return;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
memset(XLogCtl, 0, sizeof(XLogCtlData));
2001-03-22 05:01:46 +01:00
/*
* Already have read control file locally, unless in bootstrap mode. Move
* contents into shared memory.
*/
if (localControlFile)
{
memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
pfree(localControlFile);
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
* multiple of the alignment for same, so no extra alignment padding is
* needed here.
*/
allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
2001-03-22 05:01:46 +01:00
/* WAL insertion locks. Ensure they're aligned to the full padded size */
allocptr += sizeof(WALInsertLockPadded) -
((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
(WALInsertLockPadded *) allocptr;
allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
{
LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Align the start of the page buffers to a full xlog block size boundary.
* This simplifies some calculations in XLOG insertion. It is also
* required for O_DIRECT.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
XLogCtl->pages = allocptr;
memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
* in additional info.)
*/
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
XLogCtl->InstallXLogFileSegmentActive = false;
XLogCtl->WalWriterSleeping = false;
SpinLockInit(&XLogCtl->Insert.insertpos_lck);
SpinLockInit(&XLogCtl->info_lck);
SpinLockInit(&XLogCtl->ulsn_lck);
}
/*
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
* This func must be called ONCE on system install. It creates pg_control
* and the initial XLOG segment.
*/
void
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
BootStrapXLOG(void)
{
CheckPoint checkPoint;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
char *buffer;
XLogPageHeader page;
XLogLongPageHeader longpage;
XLogRecord *record;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
char *recptr;
uint64 sysidentifier;
struct timeval tv;
pg_crc32c crc;
/* allow ordinary WAL segment creation, like StartupXLOG() would */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
XLogCtl->InstallXLogFileSegmentActive = true;
LWLockRelease(ControlFileLock);
/*
* Select a hopefully-unique system identifier code for this installation.
* We use the result of gettimeofday(), including the fractional seconds
* field, as being about as unique as we can easily get. (Think not to
* use random(), since it hasn't been seeded and there's no portable way
* to seed it other than the system clock value...) The upper half of the
* uint64 value is just the tv_sec part, while the lower half contains the
* tv_usec part (which must fit in 20 bits), plus 12 bits from our current
* PID for a little extra uniqueness. A person knowing this encoding can
* determine the initialization time of the installation, which could
* perhaps be useful sometimes.
*/
gettimeofday(&tv, NULL);
sysidentifier = ((uint64) tv.tv_sec) << 32;
sysidentifier |= ((uint64) tv.tv_usec) << 12;
sysidentifier |= getpid() & 0xFFF;
/* page buffer must be aligned suitably for O_DIRECT */
buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
memset(page, 0, XLOG_BLCKSZ);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Set up information for the initial checkpoint record
*
* The initial checkpoint record is written to the beginning of the WAL
* segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
* used, so that we can use 0/0 to mean "before any valid WAL segment".
*/
checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
checkPoint.ThisTimeLineID = BootstrapTimeLineID;
checkPoint.PrevTimeLineID = BootstrapTimeLineID;
checkPoint.fullPageWrites = fullPageWrites;
checkPoint.nextXid =
FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
checkPoint.nextOid = FirstGenbkiObjectId;
checkPoint.nextMulti = FirstMultiXactId;
checkPoint.nextMultiOffset = 0;
checkPoint.oldestXid = FirstNormalTransactionId;
checkPoint.oldestXidDB = TemplateDbOid;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
checkPoint.oldestMulti = FirstMultiXactId;
checkPoint.oldestMultiDB = TemplateDbOid;
checkPoint.oldestCommitTsXid = InvalidTransactionId;
checkPoint.newestCommitTsXid = InvalidTransactionId;
checkPoint.time = (pg_time_t) time(NULL);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
checkPoint.oldestActiveXid = InvalidTransactionId;
ShmemVariableCache->nextXid = checkPoint.nextXid;
ShmemVariableCache->nextOid = checkPoint.nextOid;
ShmemVariableCache->oidCount = 0;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
/* Set up the XLOG page header */
page->xlp_magic = XLOG_PAGE_MAGIC;
page->xlp_info = XLP_LONG_HEADER;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
page->xlp_tli = BootstrapTimeLineID;
page->xlp_pageaddr = wal_segment_size;
longpage = (XLogLongPageHeader) page;
longpage->xlp_sysid = sysidentifier;
longpage->xlp_seg_size = wal_segment_size;
longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
/* Insert the initial checkpoint record */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = ((char *) page + SizeOfXLogLongPHD);
record = (XLogRecord *) recptr;
record->xl_prev = 0;
record->xl_xid = InvalidTransactionId;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
record->xl_rmid = RM_XLOG_ID;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr += SizeOfXLogRecord;
/* fill the XLogRecordDataHeaderShort struct */
*(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
*(recptr++) = sizeof(checkPoint);
memcpy(recptr, &checkPoint, sizeof(checkPoint));
recptr += sizeof(checkPoint);
Assert(recptr - (char *) record == record->xl_tot_len);
INIT_CRC32C(crc);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
FIN_CRC32C(crc);
record->xl_crc = crc;
/* Create first XLOG segment file */
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
openLogTLI = BootstrapTimeLineID;
openLogFile = XLogFileInit(1, BootstrapTimeLineID);
Account explicitly for long-lived FDs that are allocated outside fd.c. The comments in fd.c have long claimed that all file allocations should go through that module, but in reality that's not always practical. fd.c doesn't supply APIs for invoking some FD-producing syscalls like pipe() or epoll_create(); and the APIs it does supply for non-virtual FDs are mostly insistent on releasing those FDs at transaction end; and in some cases the actual open() call is in code that can't be made to use fd.c, such as libpq. This has led to a situation where, in a modern server, there are likely to be seven or so long-lived FDs per backend process that are not known to fd.c. Since NUM_RESERVED_FDS is only 10, that meant we had *very* few spare FDs if max_files_per_process is >= the system ulimit and fd.c had opened all the files it thought it safely could. The contrib/postgres_fdw regression test, in particular, could easily be made to fall over by running it under a restrictive ulimit. To improve matters, invent functions Acquire/Reserve/ReleaseExternalFD that allow outside callers to tell fd.c that they have or want to allocate a FD that's not directly managed by fd.c. Add calls to track all the fixed FDs in a standard backend session, so that we are honestly guaranteeing that NUM_RESERVED_FDS FDs remain unused below the EMFILE limit in a backend's idle state. The coding rules for these functions say that there's no need to call them in code that just allocates one FD over a fairly short interval; we can dip into NUM_RESERVED_FDS for such cases. That means that there aren't all that many places where we need to worry. But postgres_fdw and dblink must use this facility to account for long-lived FDs consumed by libpq connections. There may be other places where it's worth doing such accounting, too, but this seems like enough to solve the immediate problem. Internally to fd.c, "external" FDs are limited to max_safe_fds/3 FDs. (Callers can choose to ignore this limit, but of course it's unwise to do so except for fixed file allocations.) I also reduced the limit on "allocated" files to max_safe_fds/3 FDs (it had been max_safe_fds/2). Conceivably a smarter rule could be used here --- but in practice, on reasonable systems, max_safe_fds should be large enough that this isn't much of an issue, so KISS for now. To avoid possible regression in the number of external or allocated files that can be opened, increase FD_MINFREE and the lower limit on max_files_per_process a little bit; we now insist that the effective "ulimit -n" be at least 64. This seems like pretty clearly a bug fix, but in view of the lack of field complaints, I'll refrain from risking a back-patch. Discussion: https://postgr.es/m/E1izCmM-0005pV-Co@gemulon.postgresql.org
2020-02-24 23:28:33 +01:00
/*
* We needn't bother with Reserve/ReleaseExternalFD here, since we'll
* close the file again in a moment.
*/
/* Write the first page with the initial record */
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)
errno = ENOSPC;
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not write bootstrap write-ahead log file: %m")));
}
pgstat_report_wait_end();
pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
if (pg_fsync(openLogFile) != 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not fsync bootstrap write-ahead log file: %m")));
pgstat_report_wait_end();
if (close(openLogFile) != 0)
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not close bootstrap write-ahead log file: %m")));
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
openLogFile = -1;
/* Now create pg_control */
InitControlFile(sysidentifier);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
ControlFile->time = checkPoint.time;
ControlFile->checkPoint = checkPoint.redo;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
ControlFile->checkPointCopy = checkPoint;
/* some additional ControlFile fields are set in WriteControlFile() */
WriteControlFile();
/* Bootstrap the commit log, too */
BootStrapCLOG();
BootStrapCommitTs();
BootStrapSUBTRANS();
BootStrapMultiXact();
pfree(buffer);
/*
* Force control file to be read - in contrast to normal processing we'd
* otherwise never run the checks and GUC related initializations therein.
*/
ReadControlFile();
}
static char *
str_time(pg_time_t tnow)
{
static char buf[128];
pg_strftime(buf, sizeof(buf),
"%Y-%m-%d %H:%M:%S %Z",
pg_localtime(&tnow, log_timezone));
return buf;
}
/*
* Initialize the first WAL segment on new timeline.
*/
static void
XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
{
char xlogfname[MAXFNAMELEN];
XLogSegNo endLogSegNo;
XLogSegNo startLogSegNo;
/* we always switch to a new timeline after archive recovery */
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
Assert(endTLI != newTLI);
/*
* Update min recovery point one last time.
*/
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
/*
* Calculate the last segment on the old timeline, and the first segment
* on the new timeline. If the switch happens in the middle of a segment,
* they are the same, but if the switch happens exactly at a segment
* boundary, startLogSegNo will be endLogSegNo + 1.
*/
XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
/*
* Initialize the starting WAL segment for the new timeline. If the switch
* happens in the middle of a segment, copy data from the last WAL segment
* of the old timeline up to the switch point, to the starting WAL segment
* on the new timeline.
*/
if (endLogSegNo == startLogSegNo)
{
/*
* Make a copy of the file on the new timeline.
*
* Writing WAL isn't allowed yet, so there are no locking
* considerations. But we should be just as tense as XLogFileInit to
* avoid emplacing a bogus file.
*/
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
XLogSegmentOffset(endOfLog, wal_segment_size));
}
else
{
/*
* The switch happened at a segment boundary, so just create the next
* segment on the new timeline.
*/
int fd;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
fd = XLogFileInit(startLogSegNo, newTLI);
if (close(fd) != 0)
{
char xlogfname[MAXFNAMELEN];
int save_errno = errno;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
errno = save_errno;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", xlogfname)));
}
}
/*
* Let's just make real sure there are not .ready or .done flags posted
* for the new segment.
*/
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
XLogArchiveCleanup(xlogfname);
}
/*
* Perform cleanup actions at the conclusion of archive recovery.
*/
static void
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
TimeLineID newTLI)
{
/*
* Execute the recovery_end_command, if any.
*/
if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
ExecuteRecoveryCommand(recoveryEndCommand,
"recovery_end_command",
true,
WAIT_EVENT_RECOVERY_END_COMMAND);
/*
* We switched to a new timeline. Clean up segments on the old timeline.
*
* If there are any higher-numbered segments on the old timeline, remove
* them. They might contain valid WAL, but they might also be
* pre-allocated files containing garbage. In any case, they are not part
* of the new timeline's history so we don't need them.
*/
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
RemoveNonParentXlogFiles(EndOfLog, newTLI);
/*
* If the switch happened in the middle of a segment, what to do with the
* last, partial segment on the old timeline? If we don't archive it, and
* the server that created the WAL never archives it either (e.g. because
* it was hit by a meteor), it will never make it to the archive. That's
* OK from our point of view, because the new segment that we created with
* the new TLI contains all the WAL from the old timeline up to the switch
* point. But if you later try to do PITR to the "missing" WAL on the old
* timeline, recovery won't find it in the archive. It's physically
* present in the new file with new TLI, but recovery won't look there
* when it's recovering to the older timeline. On the other hand, if we
* archive the partial segment, and the original server on that timeline
* is still running and archives the completed version of the same segment
* later, it will fail. (We used to do that in 9.4 and below, and it
* caused such problems).
*
* As a compromise, we rename the last segment with the .partial suffix,
* and archive it. Archive recovery will never try to read .partial
* segments, so they will normally go unused. But in the odd PITR case,
* the administrator can copy them manually to the pg_wal directory
* (removing the suffix). They can be useful in debugging, too.
*
* If a .done or .ready file already exists for the old timeline, however,
* we had already determined that the segment is complete, so we can let
* it be archived normally. (In particular, if it was restored from the
* archive to begin with, it's expected to have a .done file).
*/
if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
XLogArchivingActive())
{
char origfname[MAXFNAMELEN];
XLogSegNo endLogSegNo;
XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
if (!XLogArchiveIsReadyOrDone(origfname))
{
char origpath[MAXPGPATH];
char partialfname[MAXFNAMELEN];
char partialpath[MAXPGPATH];
XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
/*
* Make sure there's no .done or .ready file for the .partial
* file.
*/
XLogArchiveCleanup(partialfname);
durable_rename(origpath, partialpath, ERROR);
XLogArchiveNotify(partialfname);
}
}
}
/*
* Check to see if required parameters are set high enough on this server
* for various aspects of recovery operation.
*
* Note that all the parameters which this function tests need to be
* listed in Administrator's Overview section in high-availability.sgml.
* If you change them, don't forget to update the list.
*/
static void
CheckRequiredParameterValues(void)
{
/*
* For archive recovery, the WAL must be generated with at least 'replica'
* wal_level.
*/
if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
{
ereport(FATAL,
(errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
errdetail("This happens if you temporarily set wal_level=minimal on the server."),
errhint("Use a backup taken after setting wal_level to higher than minimal.")));
Refactor checking whether we've reached the recovery target. Makes the replay loop slightly more readable, by separating the concerns of whether to stop and whether to delay, and how to extract the timestamp from a record. This has the user-visible change that the timestamp of the last applied record is now updated after actually applying it. Before, it was updated just before applying it. That meant that pg_last_xact_replay_timestamp() could return the timestamp of a commit record that is in process of being replayed, but not yet applied. Normally the difference is small, but if min_recovery_apply_delay is set, there could be a significant delay between reading a record and applying it. Another behavioral change is that if you recover to a restore point, we stop after the restore point record, not before it. It makes no difference as far as running queries on the server is concerned, as applying a restore point record changes nothing, but if examine the timeline history you will see that the new timeline branched off just after the restore point record, not before it. One practical consequence is that if you do PITR to the new timeline, and set recovery target to the same named restore point again, it will find and stop recovery at the same restore point. Conceptually, I think it makes more sense to consider the restore point as part of the new timeline's history than not. In principle, setting the last-replayed timestamp before actually applying the record was a bug all along, but it doesn't seem worth the risk to backpatch, since min_recovery_apply_delay was only added in 9.4.
2014-01-09 13:00:39 +01:00
}
/*
* For Hot Standby, the WAL must be generated with 'replica' mode, and we
* must have at least as many backend slots as the primary.
*/
if (ArchiveRecoveryRequested && EnableHotStandby)
{
/* We ignore autovacuum_max_workers when we make this test. */
RecoveryRequiresIntParameter("max_connections",
MaxConnections,
ControlFile->MaxConnections);
RecoveryRequiresIntParameter("max_worker_processes",
max_worker_processes,
ControlFile->max_worker_processes);
2019-02-12 02:07:56 +01:00
RecoveryRequiresIntParameter("max_wal_senders",
max_wal_senders,
ControlFile->max_wal_senders);
RecoveryRequiresIntParameter("max_prepared_transactions",
max_prepared_xacts,
ControlFile->max_prepared_xacts);
RecoveryRequiresIntParameter("max_locks_per_transaction",
max_locks_per_xact,
ControlFile->max_locks_per_xact);
}
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
}
/*
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
* This must be called ONCE during postmaster or standalone-backend startup
*/
void
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
StartupXLOG(void)
{
XLogCtlInsert *Insert;
CheckPoint checkPoint;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
bool wasShutdown;
bool haveTblspcMap;
bool haveBackupLabel;
XLogRecPtr EndOfLog;
TimeLineID EndOfLogTLI;
TimeLineID newTLI;
bool performedWalRecovery;
EndOfWalRecoveryInfo *endOfRecoveryInfo;
XLogRecPtr abortedRecPtr;
XLogRecPtr missingContrecPtr;
TransactionId oldestActiveXID;
bool promoted = false;
Use a ResourceOwner to track buffer pins in all cases. Historically, we've allowed auxiliary processes to take buffer pins without tracking them in a ResourceOwner. However, that creates problems for error recovery. In particular, we've seen multiple reports of assertion crashes in the startup process when it gets an error while holding a buffer pin, as for example if it gets ENOSPC during a write. In a non-assert build, the process would simply exit without releasing the pin at all. We've gotten away with that so far just because a failure exit of the startup process translates to a database crash anyhow; but any similar behavior in other aux processes could result in stuck pins and subsequent problems in vacuum. To improve this, institute a policy that we must *always* have a resowner backing any attempt to pin a buffer, which we can enforce just by removing the previous special-case code in resowner.c. Add infrastructure to make it easy to create a process-lifespan AuxProcessResourceOwner and clear out its contents at appropriate times. Replace existing ad-hoc resowner management in bgwriter.c and other aux processes with that. (Thus, while the startup process gains a resowner where it had none at all before, some other aux process types are replacing an ad-hoc resowner with this code.) Also use the AuxProcessResourceOwner to manage buffer pins taken during StartupXLOG and ShutdownXLOG, even when those are being run in a bootstrap process or a standalone backend rather than a true auxiliary process. In passing, remove some other ad-hoc resource owner creations that had gotten cargo-culted into various other places. As far as I can tell that was all unnecessary, and if it had been necessary it was incomplete, due to lacking any provision for clearing those resowners later. (Also worth noting in this connection is that a process that hasn't called InitBufferPoolBackend has no business accessing buffers; so there's more to do than just add the resowner if we want to touch buffers in processes not covered by this patch.) Although this fixes a very old bug, no back-patch, because there's no evidence of any significant problem in non-assert builds. Patch by me, pursuant to a report from Justin Pryzby. Thanks to Robert Haas and Kyotaro Horiguchi for reviews. Discussion: https://postgr.es/m/20180627233939.GA10276@telsasoft.com
2018-07-18 18:15:16 +02:00
/*
* We should have an aux process resource owner to use, and we should not
* be in a transaction that's installed some other resowner.
*/
Assert(AuxProcessResourceOwner != NULL);
Assert(CurrentResourceOwner == NULL ||
CurrentResourceOwner == AuxProcessResourceOwner);
CurrentResourceOwner = AuxProcessResourceOwner;
/*
* Check that contents look valid.
*/
if (!XRecOffIsValid(ControlFile->checkPoint))
ereport(FATAL,
(errmsg("control file contains invalid checkpoint location")));
switch (ControlFile->state)
{
case DB_SHUTDOWNED:
/*
* This is the expected case, so don't be chatty in standalone
* mode
*/
ereport(IsPostmasterEnvironment ? LOG : NOTICE,
(errmsg("database system was shut down at %s",
str_time(ControlFile->time))));
break;
case DB_SHUTDOWNED_IN_RECOVERY:
ereport(LOG,
(errmsg("database system was shut down in recovery at %s",
str_time(ControlFile->time))));
break;
case DB_SHUTDOWNING:
ereport(LOG,
(errmsg("database system shutdown was interrupted; last known up at %s",
str_time(ControlFile->time))));
break;
case DB_IN_CRASH_RECOVERY:
ereport(LOG,
(errmsg("database system was interrupted while in recovery at %s",
str_time(ControlFile->time)),
errhint("This probably means that some data is corrupted and"
" you will have to use the last backup for recovery.")));
break;
case DB_IN_ARCHIVE_RECOVERY:
ereport(LOG,
(errmsg("database system was interrupted while in recovery at log time %s",
str_time(ControlFile->checkPointCopy.time)),
errhint("If this has occurred more than once some data might be corrupted"
" and you might need to choose an earlier recovery target.")));
break;
case DB_IN_PRODUCTION:
ereport(LOG,
(errmsg("database system was interrupted; last known up at %s",
str_time(ControlFile->time))));
break;
default:
ereport(FATAL,
(errmsg("control file contains invalid database cluster state")));
}
/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
if (ControlFile->state != DB_SHUTDOWNED)
pg_usleep(60000000L);
#endif
/*
* Verify that pg_wal and pg_wal/archive_status exist. In cases where
* someone has performed a copy for PITR, these directories may have been
* excluded and need to be re-created.
*/
ValidateXLOGDirectoryStructure();
/* Set up timeout handler needed to report startup progress. */
if (!IsBootstrapProcessingMode())
RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
startup_progress_timeout_handler);
/*----------
* If we previously crashed, perform a couple of actions:
*
* - The pg_wal directory may still include some temporary WAL segments
* used when creating a new segment, so perform some clean up to not
* bloat this path. This is done first as there is no point to sync
* this temporary data.
*
* - There might be data which we had written, intending to fsync it, but
* which we had not actually fsync'd yet. Therefore, a power failure in
* the near future might cause earlier unflushed writes to be lost, even
* though more recent data written to disk from here on would be
* persisted. To avoid that, fsync the entire data directory.
*/
if (ControlFile->state != DB_SHUTDOWNED &&
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
{
RemoveTempXlogFiles();
SyncDataDirectory();
}
/*
* Prepare for WAL recovery if needed.
*
* InitWalRecovery analyzes the control file and the backup label file, if
* any. It updates the in-memory ControlFile buffer according to the
* starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
* It also applies the tablespace map file, if any.
*/
InitWalRecovery(ControlFile, &wasShutdown,
&haveBackupLabel, &haveTblspcMap);
checkPoint = ControlFile->checkPointCopy;
/* initialize shared memory variables from the checkpoint record */
ShmemVariableCache->nextXid = checkPoint.nextXid;
ShmemVariableCache->nextOid = checkPoint.nextOid;
ShmemVariableCache->oidCount = 0;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
SetCommitTsLimit(checkPoint.oldestCommitTsXid,
checkPoint.newestCommitTsXid);
XLogCtl->ckptFullXid = checkPoint.nextXid;
/*
* Clear out any old relcache cache files. This is *necessary* if we do
* any WAL replay, since that would probably result in the cache files
* being out of sync with database reality. In theory we could leave them
* in place if the database had been cleanly shut down, but it seems
* safest to just remove them always and let them be rebuilt during the
* first backend startup. These files needs to be removed from all
* directories including pg_tblspc, however the symlinks are created only
* after reading tablespace_map file in case of archive recovery from
* backup, so needs to clear old relcache files here after creating
* symlinks.
*/
RelationCacheInitFileRemove();
/*
* Initialize replication slots, before there's a chance to remove
* required resources.
*/
StartupReplicationSlots();
/*
* Startup logical state, needs to be setup now so we have proper data
* during crash recovery.
*/
StartupReorderBuffer();
/*
* Startup CLOG. This must be done after ShmemVariableCache->nextXid has
* been initialized and before we accept connections or begin WAL replay.
*/
StartupCLOG();
/*
Rework the way multixact truncations work. The fact that multixact truncations are not WAL logged has caused a fair share of problems. Amongst others it requires to do computations during recovery while the database is not in a consistent state, delaying truncations till checkpoints, and handling members being truncated, but offset not. We tried to put bandaids on lots of these issues over the last years, but it seems time to change course. Thus this patch introduces WAL logging for multixact truncations. This allows: 1) to perform the truncation directly during VACUUM, instead of delaying it to the checkpoint. 2) to avoid looking at the offsets SLRU for truncation during recovery, we can just use the master's values. 3) simplify a fair amount of logic to keep in memory limits straight, this has gotten much easier During the course of fixing this a bunch of additional bugs had to be fixed: 1) Data was not purged from memory the member's SLRU before deleting segments. This happened to be hard or impossible to hit due to the interlock between checkpoints and truncation. 2) find_multixact_start() relied on SimpleLruDoesPhysicalPageExist - but that doesn't work for offsets that haven't yet been flushed to disk. Add code to flush the SLRUs to fix. Not pretty, but it feels slightly safer to only make decisions based on actual on-disk state. 3) find_multixact_start() could be called concurrently with a truncation and thus fail. Via SetOffsetVacuumLimit() that could lead to a round of emergency vacuuming. The problem remains in pg_get_multixact_members(), but that's quite harmless. For now this is going to only get applied to 9.5+, leaving the issues in the older branches in place. It is quite possible that we need to backpatch at a later point though. For the case this gets backpatched we need to handle that an updated standby may be replaying WAL from a not-yet upgraded primary. We have to recognize that situation and use "old style" truncation (i.e. looking at the SLRUs) during WAL replay. In contrast to before, this now happens in the startup process, when replaying a checkpoint record, instead of the checkpointer. Doing truncation in the restartpoint is incorrect, they can happen much later than the original checkpoint, thereby leading to wraparound. To avoid "multixact_redo: unknown op code 48" errors standbys would have to be upgraded before primaries. A later patch will bump the WAL page magic, and remove the legacy truncation codepaths. Legacy truncation support is just included to make a possible future backpatch easier. Discussion: 20150621192409.GA4797@alap3.anarazel.de Reviewed-By: Robert Haas, Alvaro Herrera, Thomas Munro Backpatch: 9.5 for now
2015-09-26 19:04:25 +02:00
* Startup MultiXact. We need to do this early to be able to replay
* truncations.
*/
StartupMultiXact();
/*
* Ditto for commit timestamps. Activate the facility if the setting is
* enabled in the control file, as there should be no tracking of commit
* timestamps done when the setting was disabled. This facility can be
* started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
*/
if (ControlFile->track_commit_timestamp)
StartupCommitTs();
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
/*
* Recover knowledge about replay progress of known replication partners.
*/
StartupReplicationOrigin();
/*
* Initialize unlogged LSN. On a clean shutdown, it's restored from the
* control file. On recovery, all unlogged relations are blown away, so
* the unlogged LSN counter can be reset too.
*/
if (ControlFile->state == DB_SHUTDOWNED)
XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
else
XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
Fix more issues with cascading replication and timeline switches. When a standby server follows the master using WAL archive, and it chooses a new timeline (recovery_target_timeline='latest'), it only fetches the timeline history file for the chosen target timeline, not any other history files that might be missing from pg_xlog. For example, if the current timeline is 2, and we choose 4 as the new recovery target timeline, the history file for timeline 3 is not fetched, even if it's part of this server's history. That's enough for the standby itself - the history file for timeline 4 includes timeline 3 as well - but if a cascading standby server wants to recover to timeline 3, it needs the history file. To fix, when a new recovery target timeline is chosen, try to copy any missing history files from the archive to pg_xlog between the old and new target timeline. A second similar issue was with the WAL files. When a standby recovers from archive, and it reaches a segment that contains a switch to a new timeline, recovery fetches only the WAL file labelled with the new timeline's ID. The file from the new timeline contains a copy of the WAL from the old timeline up to the point where the switch happened, and recovery recovers it from the new file. But in streaming replication, walsender only tries to read it from the old timeline's file. To fix, change walsender to read it from the new file, so that it behaves the same as recovery in that sense, and doesn't try to open the possibly nonexistent file with the old timeline's ID.
2013-01-23 09:01:04 +01:00
/*
* Copy any missing timeline history files between 'now' and the recovery
* target timeline from archive to pg_wal. While we don't need those files
Fix more issues with cascading replication and timeline switches. When a standby server follows the master using WAL archive, and it chooses a new timeline (recovery_target_timeline='latest'), it only fetches the timeline history file for the chosen target timeline, not any other history files that might be missing from pg_xlog. For example, if the current timeline is 2, and we choose 4 as the new recovery target timeline, the history file for timeline 3 is not fetched, even if it's part of this server's history. That's enough for the standby itself - the history file for timeline 4 includes timeline 3 as well - but if a cascading standby server wants to recover to timeline 3, it needs the history file. To fix, when a new recovery target timeline is chosen, try to copy any missing history files from the archive to pg_xlog between the old and new target timeline. A second similar issue was with the WAL files. When a standby recovers from archive, and it reaches a segment that contains a switch to a new timeline, recovery fetches only the WAL file labelled with the new timeline's ID. The file from the new timeline contains a copy of the WAL from the old timeline up to the point where the switch happened, and recovery recovers it from the new file. But in streaming replication, walsender only tries to read it from the old timeline's file. To fix, change walsender to read it from the new file, so that it behaves the same as recovery in that sense, and doesn't try to open the possibly nonexistent file with the old timeline's ID.
2013-01-23 09:01:04 +01:00
* ourselves - the history file of the recovery target timeline covers all
* the previous timelines in the history too - a cascading standby server
* might be interested in them. Or, if you archive the WAL from this
* server to a different archive than the primary, it'd be good for all
Fix more issues with cascading replication and timeline switches. When a standby server follows the master using WAL archive, and it chooses a new timeline (recovery_target_timeline='latest'), it only fetches the timeline history file for the chosen target timeline, not any other history files that might be missing from pg_xlog. For example, if the current timeline is 2, and we choose 4 as the new recovery target timeline, the history file for timeline 3 is not fetched, even if it's part of this server's history. That's enough for the standby itself - the history file for timeline 4 includes timeline 3 as well - but if a cascading standby server wants to recover to timeline 3, it needs the history file. To fix, when a new recovery target timeline is chosen, try to copy any missing history files from the archive to pg_xlog between the old and new target timeline. A second similar issue was with the WAL files. When a standby recovers from archive, and it reaches a segment that contains a switch to a new timeline, recovery fetches only the WAL file labelled with the new timeline's ID. The file from the new timeline contains a copy of the WAL from the old timeline up to the point where the switch happened, and recovery recovers it from the new file. But in streaming replication, walsender only tries to read it from the old timeline's file. To fix, change walsender to read it from the new file, so that it behaves the same as recovery in that sense, and doesn't try to open the possibly nonexistent file with the old timeline's ID.
2013-01-23 09:01:04 +01:00
* the history files to get archived there after failover, so that you can
* use one of the old timelines as a PITR target. Timeline history files
* are small, so it's better to copy them unnecessarily than not copy them
* and regret later.
*/
restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
Fix more issues with cascading replication and timeline switches. When a standby server follows the master using WAL archive, and it chooses a new timeline (recovery_target_timeline='latest'), it only fetches the timeline history file for the chosen target timeline, not any other history files that might be missing from pg_xlog. For example, if the current timeline is 2, and we choose 4 as the new recovery target timeline, the history file for timeline 3 is not fetched, even if it's part of this server's history. That's enough for the standby itself - the history file for timeline 4 includes timeline 3 as well - but if a cascading standby server wants to recover to timeline 3, it needs the history file. To fix, when a new recovery target timeline is chosen, try to copy any missing history files from the archive to pg_xlog between the old and new target timeline. A second similar issue was with the WAL files. When a standby recovers from archive, and it reaches a segment that contains a switch to a new timeline, recovery fetches only the WAL file labelled with the new timeline's ID. The file from the new timeline contains a copy of the WAL from the old timeline up to the point where the switch happened, and recovery recovers it from the new file. But in streaming replication, walsender only tries to read it from the old timeline's file. To fix, change walsender to read it from the new file, so that it behaves the same as recovery in that sense, and doesn't try to open the possibly nonexistent file with the old timeline's ID.
2013-01-23 09:01:04 +01:00
/*
* Before running in recovery, scan pg_twophase and fill in its status to
* be able to work on entries generated by redo. Doing a scan before
* taking any recovery action has the merit to discard any 2PC files that
* are newer than the first record to replay, saving from any conflicts at
* replay. This avoids as well any subsequent scans when doing recovery
* of the on-disk two-phase data.
*/
restoreTwoPhaseData();
lastFullPageWrites = checkPoint.fullPageWrites;
RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
doPageWrites = lastFullPageWrites;
2000-10-21 17:43:36 +02:00
2000-10-28 18:21:00 +02:00
/* REDO */
if (InRecovery)
{
/* Initialize state for RecoveryInProgress() */
SpinLockAcquire(&XLogCtl->info_lck);
if (InArchiveRecovery)
XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
else
XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
SpinLockRelease(&XLogCtl->info_lck);
/*
* Update pg_control to show that we are recovering and to show the
* selected checkpoint as the place we are starting from. We also mark
* pg_control with any minimum recovery stop point obtained from a
* backup history file.
*
* No need to hold ControlFileLock yet, we aren't up far enough.
*/
UpdateControlFile();
/*
* If there was a backup label file, it's done its job and the info
* has now been propagated into pg_control. We must get rid of the
* label file so that if we crash during recovery, we'll pick up at
* the latest recovery restartpoint instead of going all the way back
* to the backup start point. It seems prudent though to just rename
* the file out of the way rather than delete it completely.
*/
if (haveBackupLabel)
{
unlink(BACKUP_LABEL_OLD);
durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
}
2010-02-26 03:01:40 +01:00
/*
* If there was a tablespace_map file, it's done its job and the
* symlinks have been created. We must get rid of the map file so
* that if we crash during recovery, we don't create symlinks again.
* It seems prudent though to just rename the file out of the way
* rather than delete it completely.
*/
if (haveTblspcMap)
{
unlink(TABLESPACE_MAP_OLD);
durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
}
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
/*
* Initialize our local copy of minRecoveryPoint. When doing crash
* recovery we want to replay up to the end of WAL. Particularly, in
* the case of a promoted standby minRecoveryPoint value in the
* control file is only updated after the first checkpoint. However,
* if the instance crashes before the first post-recovery checkpoint
* is completed then recovery will use a stale location causing the
* startup process to think that there are still invalid page
* references when checking for data consistency.
*/
if (InArchiveRecovery)
{
LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
}
else
{
LocalMinRecoveryPoint = InvalidXLogRecPtr;
LocalMinRecoveryPointTLI = 0;
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
}
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* Reset pgstat data, because it may be invalid after recovery.
*/
pgstat_reset_all();
/* Check that the GUCs used to generate the WAL allow recovery */
CheckRequiredParameterValues();
/*
2012-04-16 14:36:40 +02:00
* We're in recovery, so unlogged relations may be trashed and must be
* reset. This should be done BEFORE allowing Hot Standby
* connections, so that read-only backends don't try to read whatever
* garbage is left over from before.
*/
ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
/*
* Likewise, delete any saved transaction snapshot files that got left
* behind by crashed backends.
*/
DeleteAllExportedSnapshotFiles();
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* Initialize for Hot Standby, if enabled. We won't let backends in
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
* yet, not until we've reached the min recovery point specified in
* control file and we've established a recovery snapshot from a
* running-xacts WAL record.
*/
If recovery.conf is created after "pg_ctl stop -m i", do crash recovery. If you create a base backup using an atomic filesystem snapshot, and try to perform PITR starting from that base backup, or if you just kill a master server and create recovery.conf to put it into standby mode, we don't know how far we need to recover before reaching consistency. Normally in crash recovery, we replay all the WAL present in pg_xlog, and assume that we're consistent after that. And normally in archive recovery, minRecoveryPoint, backupEndRequired, or backupEndPoint is set in the control file, indicating how far we need to replay to reach consistency. But if the server was previously up and running normally, and you kill -9 it or take an atomic filesystem snapshot, none of those fields are set in the control file. The solution is to perform crash recovery first, replaying all the WAL in pg_xlog. After that's done, we assume that the system is consistent like in normal crash recovery, and switch to archive recovery mode after that. Per report from Kyotaro HORIGUCHI. In his scenario, recovery.conf was created after "pg_ctl stop -m i". I'm not sure we need to support that exact scenario, but we should support backing up using a filesystem snapshot, which looks identical. This issue goes back to at least 9.0, where hot standby was introduced and we started to track when consistency is reached. In 9.1 and 9.2, we would open up for hot standby too early, and queries could briefly see an inconsistent state. But 9.2 made it more visible, as we started to PANIC if we see a reference to a non-existing page during recovery, if we've already reached consistency. This is a fairly big patch, so back-patch to 9.2 only, where the issue is more visible. We can consider back-patching further after this has received some more testing in 9.2 and master.
2013-02-22 10:43:04 +01:00
if (ArchiveRecoveryRequested && EnableHotStandby)
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
{
TransactionId *xids;
int nxids;
ereport(DEBUG1,
(errmsg_internal("initializing for hot standby")));
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
InitRecoveryTransactionEnvironment();
if (wasShutdown)
oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
else
oldestActiveXID = checkPoint.oldestActiveXid;
Assert(TransactionIdIsValid(oldestActiveXID));
/* Tell procarray about the range of xids it has to deal with */
ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));
/*
* Startup subtrans only. CLOG, MultiXact and commit timestamp
* have already been started up and other SLRUs are not maintained
* during recovery and need not be started yet.
*/
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
StartupSUBTRANS(oldestActiveXID);
/*
* If we're beginning at a shutdown checkpoint, we know that
* nothing was running on the primary at this point. So fake-up an
* empty running-xacts record and use that here and now. Recover
* additional standby state for prepared transactions.
*/
if (wasShutdown)
{
RunningTransactionsData running;
TransactionId latestCompletedXid;
/*
* Construct a RunningTransactions snapshot representing a
* shut down server, with only prepared transactions still
* alive. We're never overflowed at this point because all
* subxids are listed with their parent prepared transactions.
*/
running.xcnt = nxids;
running.subxcnt = 0;
running.subxid_overflow = false;
running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
running.oldestRunningXid = oldestActiveXID;
latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
TransactionIdRetreat(latestCompletedXid);
Assert(TransactionIdIsNormal(latestCompletedXid));
running.latestCompletedXid = latestCompletedXid;
running.xids = xids;
ProcArrayApplyRecoveryInfo(&running);
StandbyRecoverPreparedTransactions();
}
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
}
/*
* We're all set for replaying the WAL now. Do it.
*/
PerformWalRecovery();
performedWalRecovery = true;
2000-10-28 18:21:00 +02:00
}
else
performedWalRecovery = false;
2000-10-28 18:21:00 +02:00
/*
* Finish WAL recovery.
*/
endOfRecoveryInfo = FinishWalRecovery();
EndOfLog = endOfRecoveryInfo->endOfLog;
EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
/*
* Complain if we did not roll forward far enough to render the backup
* dump consistent. Note: it is indeed okay to look at the local variable
* LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
* might be further ahead --- ControlFile->minRecoveryPoint cannot have
* been advanced beyond the WAL we processed.
*/
if (InRecovery &&
(EndOfLog < LocalMinRecoveryPoint ||
!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
{
/*
* Ran off end of WAL before reaching end-of-backup WAL record, or
* minRecoveryPoint. That's usually a bad sign, indicating that you
* tried to recover from an online backup but never called
* pg_stop_backup(), or you didn't archive all the WAL up to that
* point. However, this also happens in crash recovery, if the system
* crashes while an online backup is in progress. We must not treat
* that as an error, or the database will refuse to start up.
*/
If recovery.conf is created after "pg_ctl stop -m i", do crash recovery. If you create a base backup using an atomic filesystem snapshot, and try to perform PITR starting from that base backup, or if you just kill a master server and create recovery.conf to put it into standby mode, we don't know how far we need to recover before reaching consistency. Normally in crash recovery, we replay all the WAL present in pg_xlog, and assume that we're consistent after that. And normally in archive recovery, minRecoveryPoint, backupEndRequired, or backupEndPoint is set in the control file, indicating how far we need to replay to reach consistency. But if the server was previously up and running normally, and you kill -9 it or take an atomic filesystem snapshot, none of those fields are set in the control file. The solution is to perform crash recovery first, replaying all the WAL in pg_xlog. After that's done, we assume that the system is consistent like in normal crash recovery, and switch to archive recovery mode after that. Per report from Kyotaro HORIGUCHI. In his scenario, recovery.conf was created after "pg_ctl stop -m i". I'm not sure we need to support that exact scenario, but we should support backing up using a filesystem snapshot, which looks identical. This issue goes back to at least 9.0, where hot standby was introduced and we started to track when consistency is reached. In 9.1 and 9.2, we would open up for hot standby too early, and queries could briefly see an inconsistent state. But 9.2 made it more visible, as we started to PANIC if we see a reference to a non-existing page during recovery, if we've already reached consistency. This is a fairly big patch, so back-patch to 9.2 only, where the issue is more visible. We can consider back-patching further after this has received some more testing in 9.2 and master.
2013-02-22 10:43:04 +01:00
if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
{
if (ControlFile->backupEndRequired)
ereport(FATAL,
(errmsg("WAL ends before end of online backup"),
errhint("All WAL generated while online backup was taken must be available at recovery.")));
else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
ereport(FATAL,
(errmsg("WAL ends before end of online backup"),
errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
else
ereport(FATAL,
(errmsg("WAL ends before consistent recovery point")));
}
}
/*
* Reset unlogged relations to the contents of their INIT fork. This is
* done AFTER recovery is complete so as to include any unlogged relations
* created during recovery, but BEFORE recovery is marked as having
* completed successfully. Otherwise we'd not retry if any of the post
* end-of-recovery steps fail.
*/
if (InRecovery)
ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
Rework order of end-of-recovery actions to delay timeline history write A critical failure in some of the end-of-recovery actions before the end-of-recovery record is written can cause PostgreSQL to react inconsistently with the rest of the cluster in the event of a crash before the final record is written. Two such failures are for example an error while processing a two-phase state files or when operating on recovery.conf. With this commit, the failures are still considered FATAL, but the write of the timeline history file is delayed as much as possible so as the window between the moment the file is written and the end-of-recovery record is generated gets minimized. This way, in the event of a crash or a failure, the new timeline decided at promotion will not seem taken by other nodes in the cluster. It is not really possible to reduce to zero this window, hence one could still see failures if a crash happens between the history file write and the end-of-recovery record, so any future code should be careful when adding new end-of-recovery actions. The original report from Magnus Hagander mentioned a renamed recovery.conf as original end-of-recovery failure which caused a timeline to be seen as taken but the subsequent processing on the now-missing recovery.conf cause the startup process to issue stop on FATAL, which at follow-up startup made the system inconsistent because of on-disk changes which already happened. Processing of two-phase state files still needs some work as corrupted entries are simply ignored now. This is left as a future item and this commit fixes the original complain. Reported-by: Magnus Hagander Author: Heikki Linnakangas Reviewed-by: Alexander Korotkov, Michael Paquier, David Steele Discussion: https://postgr.es/m/CABUevEz09XY2EevA2dLjPCY-C5UO4Hq=XxmXLmF6ipNFecbShQ@mail.gmail.com
2018-07-09 03:22:34 +02:00
/*
* Pre-scan prepared transactions to find out the range of XIDs present.
* This information is not quite needed yet, but it is positioned here so
* as potential problems are detected before any on-disk change is done.
*/
oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
/*
* Allow ordinary WAL segment creation before possibly switching to a new
* timeline, which creates a new segment, and after the last ReadRecord().
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
XLogCtl->InstallXLogFileSegmentActive = true;
LWLockRelease(ControlFileLock);
/*
* Consider whether we need to assign a new timeline ID.
*
* If we did archive recovery, we always assign a new ID. This handles a
* couple of issues. If we stopped short of the end of WAL during
* recovery, then we are clearly generating a new timeline and must assign
* it a unique new ID. Even if we ran to the end, modifying the current
* last segment is problematic because it may result in trying to
* overwrite an already-archived copy of that segment, and we encourage
* DBAs to make their archive_commands reject that. We can dodge the
* problem by making the new active segment have a new timeline ID.
*
* In a normal crash recovery, we can just extend the timeline we were in.
*/
newTLI = endOfRecoveryInfo->lastRecTLI;
If recovery.conf is created after "pg_ctl stop -m i", do crash recovery. If you create a base backup using an atomic filesystem snapshot, and try to perform PITR starting from that base backup, or if you just kill a master server and create recovery.conf to put it into standby mode, we don't know how far we need to recover before reaching consistency. Normally in crash recovery, we replay all the WAL present in pg_xlog, and assume that we're consistent after that. And normally in archive recovery, minRecoveryPoint, backupEndRequired, or backupEndPoint is set in the control file, indicating how far we need to replay to reach consistency. But if the server was previously up and running normally, and you kill -9 it or take an atomic filesystem snapshot, none of those fields are set in the control file. The solution is to perform crash recovery first, replaying all the WAL in pg_xlog. After that's done, we assume that the system is consistent like in normal crash recovery, and switch to archive recovery mode after that. Per report from Kyotaro HORIGUCHI. In his scenario, recovery.conf was created after "pg_ctl stop -m i". I'm not sure we need to support that exact scenario, but we should support backing up using a filesystem snapshot, which looks identical. This issue goes back to at least 9.0, where hot standby was introduced and we started to track when consistency is reached. In 9.1 and 9.2, we would open up for hot standby too early, and queries could briefly see an inconsistent state. But 9.2 made it more visible, as we started to PANIC if we see a reference to a non-existing page during recovery, if we've already reached consistency. This is a fairly big patch, so back-patch to 9.2 only, where the issue is more visible. We can consider back-patching further after this has received some more testing in 9.2 and master.
2013-02-22 10:43:04 +01:00
if (ArchiveRecoveryRequested)
{
newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
ereport(LOG,
(errmsg("selected new timeline ID: %u", newTLI)));
/*
* Make a writable copy of the last WAL segment. (Note that we also
* have a copy of the last block of the old WAL in
* endOfRecovery->lastPage; we will use that below.)
*/
XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
Rework order of end-of-recovery actions to delay timeline history write A critical failure in some of the end-of-recovery actions before the end-of-recovery record is written can cause PostgreSQL to react inconsistently with the rest of the cluster in the event of a crash before the final record is written. Two such failures are for example an error while processing a two-phase state files or when operating on recovery.conf. With this commit, the failures are still considered FATAL, but the write of the timeline history file is delayed as much as possible so as the window between the moment the file is written and the end-of-recovery record is generated gets minimized. This way, in the event of a crash or a failure, the new timeline decided at promotion will not seem taken by other nodes in the cluster. It is not really possible to reduce to zero this window, hence one could still see failures if a crash happens between the history file write and the end-of-recovery record, so any future code should be careful when adding new end-of-recovery actions. The original report from Magnus Hagander mentioned a renamed recovery.conf as original end-of-recovery failure which caused a timeline to be seen as taken but the subsequent processing on the now-missing recovery.conf cause the startup process to issue stop on FATAL, which at follow-up startup made the system inconsistent because of on-disk changes which already happened. Processing of two-phase state files still needs some work as corrupted entries are simply ignored now. This is left as a future item and this commit fixes the original complain. Reported-by: Magnus Hagander Author: Heikki Linnakangas Reviewed-by: Alexander Korotkov, Michael Paquier, David Steele Discussion: https://postgr.es/m/CABUevEz09XY2EevA2dLjPCY-C5UO4Hq=XxmXLmF6ipNFecbShQ@mail.gmail.com
2018-07-09 03:22:34 +02:00
/*
* Remove the signal files out of the way, so that we don't
* accidentally re-enter archive recovery mode in a subsequent crash.
Rework order of end-of-recovery actions to delay timeline history write A critical failure in some of the end-of-recovery actions before the end-of-recovery record is written can cause PostgreSQL to react inconsistently with the rest of the cluster in the event of a crash before the final record is written. Two such failures are for example an error while processing a two-phase state files or when operating on recovery.conf. With this commit, the failures are still considered FATAL, but the write of the timeline history file is delayed as much as possible so as the window between the moment the file is written and the end-of-recovery record is generated gets minimized. This way, in the event of a crash or a failure, the new timeline decided at promotion will not seem taken by other nodes in the cluster. It is not really possible to reduce to zero this window, hence one could still see failures if a crash happens between the history file write and the end-of-recovery record, so any future code should be careful when adding new end-of-recovery actions. The original report from Magnus Hagander mentioned a renamed recovery.conf as original end-of-recovery failure which caused a timeline to be seen as taken but the subsequent processing on the now-missing recovery.conf cause the startup process to issue stop on FATAL, which at follow-up startup made the system inconsistent because of on-disk changes which already happened. Processing of two-phase state files still needs some work as corrupted entries are simply ignored now. This is left as a future item and this commit fixes the original complain. Reported-by: Magnus Hagander Author: Heikki Linnakangas Reviewed-by: Alexander Korotkov, Michael Paquier, David Steele Discussion: https://postgr.es/m/CABUevEz09XY2EevA2dLjPCY-C5UO4Hq=XxmXLmF6ipNFecbShQ@mail.gmail.com
2018-07-09 03:22:34 +02:00
*/
if (endOfRecoveryInfo->standby_signal_file_found)
durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
if (endOfRecoveryInfo->recovery_signal_file_found)
durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
Rework order of end-of-recovery actions to delay timeline history write A critical failure in some of the end-of-recovery actions before the end-of-recovery record is written can cause PostgreSQL to react inconsistently with the rest of the cluster in the event of a crash before the final record is written. Two such failures are for example an error while processing a two-phase state files or when operating on recovery.conf. With this commit, the failures are still considered FATAL, but the write of the timeline history file is delayed as much as possible so as the window between the moment the file is written and the end-of-recovery record is generated gets minimized. This way, in the event of a crash or a failure, the new timeline decided at promotion will not seem taken by other nodes in the cluster. It is not really possible to reduce to zero this window, hence one could still see failures if a crash happens between the history file write and the end-of-recovery record, so any future code should be careful when adding new end-of-recovery actions. The original report from Magnus Hagander mentioned a renamed recovery.conf as original end-of-recovery failure which caused a timeline to be seen as taken but the subsequent processing on the now-missing recovery.conf cause the startup process to issue stop on FATAL, which at follow-up startup made the system inconsistent because of on-disk changes which already happened. Processing of two-phase state files still needs some work as corrupted entries are simply ignored now. This is left as a future item and this commit fixes the original complain. Reported-by: Magnus Hagander Author: Heikki Linnakangas Reviewed-by: Alexander Korotkov, Michael Paquier, David Steele Discussion: https://postgr.es/m/CABUevEz09XY2EevA2dLjPCY-C5UO4Hq=XxmXLmF6ipNFecbShQ@mail.gmail.com
2018-07-09 03:22:34 +02:00
/*
* Write the timeline history file, and have it archived. After this
* point (or rather, as soon as the file is archived), the timeline
* will appear as "taken" in the WAL archive and to any standby
* servers. If we crash before actually switching to the new
* timeline, standby servers will nevertheless think that we switched
* to the new timeline, and will try to connect to the new timeline.
* To minimize the window for that, try to do as little as possible
* between here and writing the end-of-recovery record.
*/
writeTimeLineHistory(newTLI, recoveryTargetTLI,
EndOfLog, endOfRecoveryInfo->recoveryStopReason);
ereport(LOG,
(errmsg("archive recovery complete")));
}
/* Save the selected TimeLineID in shared memory, too */
XLogCtl->InsertTimeLineID = newTLI;
XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
/*
* Actually, if WAL ended in an incomplete record, skip the parts that
* made it through and start writing after the portion that persisted.
* (It's critical to first write an OVERWRITE_CONTRECORD message, which
* we'll do as soon as we're open for writing new WAL.)
*/
if (!XLogRecPtrIsInvalid(missingContrecPtr))
{
Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
EndOfLog = missingContrecPtr;
}
/*
* Prepare to write WAL starting at EndOfLog location, and init xlog
* buffer cache using the block containing the last record from the
* previous incarnation.
*/
2000-10-28 18:21:00 +02:00
Insert = &XLogCtl->Insert;
Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
2001-03-22 05:01:46 +01:00
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Tricky point here: lastPage contains the *last* block that the LastRec
* record spans, not the one it starts in. The last block is indeed the
* one we want to use.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
if (EndOfLog % XLOG_BLCKSZ != 0)
{
char *page;
int len;
int firstIdx;
2000-10-28 18:21:00 +02:00
firstIdx = XLogRecPtrToBufIdx(EndOfLog);
len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
Assert(len < XLOG_BLCKSZ);
2000-10-28 18:21:00 +02:00
/* Copy the valid part of the last block, and zero the rest */
page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
memcpy(page, endOfRecoveryInfo->lastPage, len);
memset(page + len, 0, XLOG_BLCKSZ - len);
XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
}
else
{
/*
* There is no partial block to copy. Just set InitializedUpTo, and
* let the first attempt to insert a log record to initialize the next
* buffer.
*/
XLogCtl->InitializedUpTo = EndOfLog;
}
LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
XLogCtl->LogwrtResult = LogwrtResult;
XLogCtl->LogwrtRqst.Write = EndOfLog;
XLogCtl->LogwrtRqst.Flush = EndOfLog;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Preallocate additional log files, if wanted.
*/
PreallocXlogFiles(EndOfLog, newTLI);
/*
* Okay, we're officially UP.
*/
2000-10-28 18:21:00 +02:00
InRecovery = false;
/* start the archive_timeout timer and LSN running */
XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
XLogCtl->lastSegSwitchLSN = EndOfLog;
/* also initialize latestCompletedXid, to nextXid - 1 */
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
LWLockRelease(ProcArrayLock);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* Start up subtrans, if not already done for hot standby. (commit
* timestamps are started below, if necessary.)
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
*/
if (standbyState == STANDBY_DISABLED)
StartupSUBTRANS(oldestActiveXID);
/*
* Perform end of recovery actions for any SLRUs that need it.
*/
TrimCLOG();
TrimMultiXact();
/* Reload shared-memory state for prepared transactions */
RecoverPreparedTransactions();
/* Shut down xlogreader */
ShutdownWalRecovery();
/* Enable WAL writes for this backend only. */
LocalSetXLogInsertAllowed();
/* If necessary, write overwrite-contrecord before doing anything else */
if (!XLogRecPtrIsInvalid(abortedRecPtr))
{
Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
}
/*
* Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
* record before resource manager writes cleanup WAL records or checkpoint
* record is written.
*/
Insert->fullPageWrites = lastFullPageWrites;
UpdateFullPageWrites();
/*
* Emit checkpoint or end-of-recovery record in XLOG, if required.
*/
if (performedWalRecovery)
promoted = PerformRecoveryXLogAction();
/*
* If any of the critical GUCs have changed, log them before we allow
* backends to write WAL.
*/
XLogReportParameters();
/* If this is archive recovery, perform post-recovery cleanup actions. */
if (ArchiveRecoveryRequested)
CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
/*
* Local WAL inserts enabled, so it's time to finish initialization of
* commit timestamp.
*/
CompleteCommitTsInitialization();
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* All done with end-of-recovery actions.
*
* Now allow backends to write WAL and update the control file status in
* consequence. SharedRecoveryState, that controls if backends can write
* WAL, is updated while holding ControlFileLock to prevent other backends
* to look at an inconsistent state of the control file in shared memory.
* There is still a small window during which backends can write WAL and
* the control file is still referring to a system not in DB_IN_PRODUCTION
* state while looking at the on-disk control file.
*
* Also, we use info_lck to update SharedRecoveryState to ensure that
* there are no race conditions concerning visibility of other recent
* updates to shared memory.
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_IN_PRODUCTION;
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
SpinLockRelease(&XLogCtl->info_lck);
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
UpdateControlFile();
LWLockRelease(ControlFileLock);
Fix snapshot builds during promotion of hot standby node with 2PC Some specific logic is done at the end of recovery when involving 2PC transactions: 1) Call RecoverPreparedTransactions(), to recover the state of 2PC transactions into memory (re-acquire locks, etc.). 2) ShutdownRecoveryTransactionEnvironment(), to move back to normal operations, mainly cleaning up recovery locks and KnownAssignedXids (including any 2PC transaction tracked previously). 3) Switch XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE, which is the tipping point for any process calling RecoveryInProgress() to check if the cluster is still in recovery or not. Any snapshot taken between steps 2) and 3) would be empty, causing any transaction relying on a snapshot at this point to potentially corrupt data as there could still be some 2PC transactions to track, with RecentXmin moving backwards on successive calls to GetSnapshotData() in the same transaction. As SharedRecoveryState is the point to take into account to know if it is safe to discard KnownAssignedXids, this commit moves step 2) after step 3), so as we can never finish with empty snapshots. This exists since the introduction of hot standby, so backpatch all the way down. The window with incorrect snapshots is extremely small, but I have seen it when running 023_pitr_prepared_xact.pl, as did buildfarm member fairywren. Thomas Munro also found it independently. Special thanks to Andres Freund for taking the time to analyze this issue. Reported-by: Thomas Munro, Michael Paquier Analyzed-by: Andres Freund Discussion: https://postgr.es/m/20210422203603.fdnh3fu2mmfp2iov@alap3.anarazel.de Backpatch-through: 9.6
2021-10-04 07:05:20 +02:00
/*
* Shutdown the recovery environment. This must occur after
* RecoverPreparedTransactions() (see notes in lock_twophase_recover())
* and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
* any session building a snapshot will not rely on KnownAssignedXids as
* RecoveryInProgress() would return false at this stage. This is
* particularly critical for prepared 2PC transactions, that would still
* need to be included in snapshots once recovery has ended.
*/
if (standbyState != STANDBY_DISABLED)
ShutdownRecoveryTransactionEnvironment();
Allow a streaming replication standby to follow a timeline switch. Before this patch, streaming replication would refuse to start replicating if the timeline in the primary doesn't exactly match the standby. The situation where it doesn't match is when you have a master, and two standbys, and you promote one of the standbys to become new master. Promoting bumps up the timeline ID, and after that bump, the other standby would refuse to continue. There's significantly more timeline related logic in streaming replication now. First of all, when a standby connects to primary, it will ask the primary for any timeline history files that are missing from the standby. The missing files are sent using a new replication command TIMELINE_HISTORY, and stored in standby's pg_xlog directory. Using the timeline history files, the standby can follow the latest timeline present in the primary (recovery_target_timeline='latest'), just as it can follow new timelines appearing in an archive directory. START_REPLICATION now takes a TIMELINE parameter, to specify exactly which timeline to stream WAL from. This allows the standby to request the primary to send over WAL that precedes the promotion. The replication protocol is changed slightly (in a backwards-compatible way although there's little hope of streaming replication working across major versions anyway), to allow replication to stop when the end of timeline reached, putting the walsender back into accepting a replication command. Many thanks to Amit Kapila for testing and reviewing various versions of this patch.
2012-12-13 18:00:00 +01:00
/*
* If there were cascading standby servers connected to us, nudge any wal
* sender processes to notice that we've been promoted.
*/
WalSndWakeup();
/*
* If this was a promotion, request an (online) checkpoint now. This isn't
* required for consistency, but the last restartpoint might be far back,
* and in case of a crash, recovering from it might take a longer than is
* appropriate now that we're not in standby mode anymore.
*/
if (promoted)
RequestCheckpoint(CHECKPOINT_FORCE);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
}
/*
* Callback from PerformWalRecovery(), called when we switch from crash
* recovery to archive recovery mode. Updates the control file accordingly.
*/
void
SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
{
/* initialize minRecoveryPoint to this record */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
if (ControlFile->minRecoveryPoint < EndRecPtr)
{
ControlFile->minRecoveryPoint = EndRecPtr;
ControlFile->minRecoveryPointTLI = replayTLI;
}
/* update local copy */
LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
/*
* The startup process can update its local copy of minRecoveryPoint from
* this point.
*/
updateMinRecoveryPoint = true;
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
UpdateControlFile();
/*
* We update SharedRecoveryState while holding the lock on ControlFileLock
* so both states are consistent in shared memory.
*/
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
SpinLockRelease(&XLogCtl->info_lck);
LWLockRelease(ControlFileLock);
}
/*
* Callback from PerformWalRecovery(), called when we reach the end of backup.
* Updates the control file accordingly.
*/
void
ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
{
/*
* We have reached the end of base backup, as indicated by pg_control. The
* data on disk is now consistent (unless minRecovery point is further
* ahead, which can happen if we crashed during previous recovery). Reset
* backupStartPoint and backupEndPoint, and update minRecoveryPoint to
* make sure we don't allow starting up at an earlier point even if
* recovery is stopped and restarted soon after this.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (ControlFile->minRecoveryPoint < EndRecPtr)
{
ControlFile->minRecoveryPoint = EndRecPtr;
ControlFile->minRecoveryPointTLI = tli;
}
ControlFile->backupStartPoint = InvalidXLogRecPtr;
ControlFile->backupEndPoint = InvalidXLogRecPtr;
ControlFile->backupEndRequired = false;
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
/*
* Perform whatever XLOG actions are necessary at end of REDO.
*
* The goal here is to make sure that we'll be able to recover properly if
* we crash again. If we choose to write a checkpoint, we'll write a shutdown
* checkpoint rather than an on-line one. This is not particularly critical,
* but since we may be assigning a new TLI, using a shutdown checkpoint allows
* us to have the rule that TLI only changes in shutdown checkpoints, which
* allows some extra error checking in xlog_redo.
*/
static bool
PerformRecoveryXLogAction(void)
{
bool promoted = false;
/*
* Perform a checkpoint to update all our recovery activity to disk.
*
* Note that we write a shutdown checkpoint rather than an on-line one.
* This is not particularly critical, but since we may be assigning a new
* TLI, using a shutdown checkpoint allows us to have the rule that TLI
* only changes in shutdown checkpoints, which allows some extra error
* checking in xlog_redo.
*
* In promotion, only create a lightweight end-of-recovery record instead
* of a full checkpoint. A checkpoint is requested later, after we're
* fully out of recovery mode and already accepting queries.
*/
if (ArchiveRecoveryRequested && IsUnderPostmaster &&
PromoteIsTriggered())
{
promoted = true;
/*
* Insert a special WAL record to mark the end of recovery, since we
* aren't doing a checkpoint. That means that the checkpointer process
* may likely be in the middle of a time-smoothed restartpoint and
* could continue to be for minutes after this. That sounds strange,
* but the effect is roughly the same and it would be stranger to try
* to come out of the restartpoint and then checkpoint. We request a
* checkpoint later anyway, just for safety.
*/
CreateEndOfRecoveryRecord();
}
else
{
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_IMMEDIATE |
CHECKPOINT_WAIT);
}
return promoted;
}
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* Is the system still in recovery?
*
* Unlike testing InRecovery, this works in any process that's connected to
* shared memory.
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
bool
RecoveryInProgress(void)
{
/*
* We check shared state each time only until we leave recovery mode. We
* can't re-enter recovery, so there's no need to keep checking after the
* shared variable has once been seen false.
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
if (!LocalRecoveryInProgress)
return false;
else
{
/*
* use volatile pointer to make sure we make a fresh read of the
* shared variable.
*/
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
volatile XLogCtlData *xlogctl = XLogCtl;
LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* Note: We don't need a memory barrier when we're still in recovery.
* We might exit recovery immediately after return, so the caller
* can't rely on 'true' meaning that we're still in recovery anyway.
*/
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
return LocalRecoveryInProgress;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
}
/*
* Returns current recovery state from shared memory.
*
* This returned state is kept consistent with the contents of the control
* file. See details about the possible values of RecoveryState in xlog.h.
*/
RecoveryState
GetRecoveryState(void)
{
RecoveryState retval;
SpinLockAcquire(&XLogCtl->info_lck);
retval = XLogCtl->SharedRecoveryState;
SpinLockRelease(&XLogCtl->info_lck);
return retval;
}
/*
* Is this process allowed to insert new WAL records?
*
* Ordinarily this is essentially equivalent to !RecoveryInProgress().
* But we also have provisions for forcing the result "true" or "false"
* within specific processes regardless of the global state.
*/
bool
XLogInsertAllowed(void)
{
/*
* If value is "unconditionally true" or "unconditionally false", just
* return it. This provides the normal fast path once recovery is known
* done.
*/
if (LocalXLogInsertAllowed >= 0)
return (bool) LocalXLogInsertAllowed;
/*
* Else, must check to see if we're still in recovery.
*/
if (RecoveryInProgress())
return false;
/*
* On exit from recovery, reset to "unconditionally true", since there is
* no need to keep checking.
*/
LocalXLogInsertAllowed = 1;
return true;
}
/*
* Make XLogInsertAllowed() return true in the current process only.
*
* Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
* and even call LocalSetXLogInsertAllowed() again after that.
*
* Returns the previous value of LocalXLogInsertAllowed.
*/
static int
LocalSetXLogInsertAllowed(void)
{
int oldXLogAllowed = LocalXLogInsertAllowed;
LocalXLogInsertAllowed = 1;
return oldXLogAllowed;
}
/*
* Return the current Redo pointer from shared memory.
*
* As a side-effect, the local RedoRecPtr copy is updated.
*/
XLogRecPtr
GetRedoRecPtr(void)
{
XLogRecPtr ptr;
/*
* The possibly not up-to-date copy in XlogCtl is enough. Even if we
* grabbed a WAL insertion lock to read the authoritative value in
* Insert->RedoRecPtr, someone might update it just after we've released
* the lock.
*/
SpinLockAcquire(&XLogCtl->info_lck);
ptr = XLogCtl->RedoRecPtr;
SpinLockRelease(&XLogCtl->info_lck);
if (RedoRecPtr < ptr)
RedoRecPtr = ptr;
return RedoRecPtr;
2000-10-21 17:43:36 +02:00
}
/*
* Return information needed to decide whether a modified block needs a
* full-page image to be included in the WAL record.
*
* The returned values are cached copies from backend-private memory, and
* possibly out-of-date or, indeed, uninitialized, in which case they will
* be InvalidXLogRecPtr and false, respectively. XLogInsertRecord will
* re-check them against up-to-date values, while holding the WAL insert lock.
*/
void
GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
{
*RedoRecPtr_p = RedoRecPtr;
*doPageWrites_p = doPageWrites;
}
/*
* GetInsertRecPtr -- Returns the current insert position.
*
* NOTE: The value *actually* returned is the position of the last full
* xlog page. It lags behind the real insert position by at most 1 page.
* For that, we don't need to scan through WAL insertion locks, and an
* approximation is enough for the current usage of this function.
*/
XLogRecPtr
GetInsertRecPtr(void)
{
XLogRecPtr recptr;
SpinLockAcquire(&XLogCtl->info_lck);
recptr = XLogCtl->LogwrtRqst.Write;
SpinLockRelease(&XLogCtl->info_lck);
return recptr;
}
/*
* GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
* position known to be fsync'd to disk. This should only be used on a
* system that is known not to be in recovery.
*/
XLogRecPtr
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
GetFlushRecPtr(TimeLineID *insertTLI)
{
Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
SpinLockAcquire(&XLogCtl->info_lck);
LogwrtResult = XLogCtl->LogwrtResult;
SpinLockRelease(&XLogCtl->info_lck);
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
/*
* If we're writing and flushing WAL, the time line can't be changing, so
* no lock is required.
*/
if (insertTLI)
*insertTLI = XLogCtl->InsertTimeLineID;
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
return LogwrtResult.Flush;
}
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
/*
* GetWALInsertionTimeLine -- Returns the current timeline of a system that
* is not in recovery.
*/
TimeLineID
GetWALInsertionTimeLine(void)
{
Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
/* Since the value can't be changing, no lock is required. */
return XLogCtl->InsertTimeLineID;
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
}
/*
* GetLastImportantRecPtr -- Returns the LSN of the last important record
* inserted. All records not explicitly marked as unimportant are considered
* important.
*
* The LSN is determined by computing the maximum of
* WALInsertLocks[i].lastImportantAt.
*/
XLogRecPtr
GetLastImportantRecPtr(void)
{
XLogRecPtr res = InvalidXLogRecPtr;
int i;
for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
{
XLogRecPtr last_important;
/*
* Need to take a lock to prevent torn reads of the LSN, which are
* possible on some of the supported platforms. WAL insert locks only
* support exclusive mode, so we have to use that.
*/
LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
last_important = WALInsertLocks[i].l.lastImportantAt;
LWLockRelease(&WALInsertLocks[i].l.lock);
if (res < last_important)
res = last_important;
}
return res;
}
/*
* Get the time and LSN of the last xlog segment switch
*/
pg_time_t
GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
{
pg_time_t result;
/* Need WALWriteLock, but shared lock is sufficient */
LWLockAcquire(WALWriteLock, LW_SHARED);
result = XLogCtl->lastSegSwitchTime;
*lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
LWLockRelease(WALWriteLock);
return result;
}
/*
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
* This must be called ONCE during postmaster or standalone-backend shutdown
*/
void
ShutdownXLOG(int code, Datum arg)
{
Use a ResourceOwner to track buffer pins in all cases. Historically, we've allowed auxiliary processes to take buffer pins without tracking them in a ResourceOwner. However, that creates problems for error recovery. In particular, we've seen multiple reports of assertion crashes in the startup process when it gets an error while holding a buffer pin, as for example if it gets ENOSPC during a write. In a non-assert build, the process would simply exit without releasing the pin at all. We've gotten away with that so far just because a failure exit of the startup process translates to a database crash anyhow; but any similar behavior in other aux processes could result in stuck pins and subsequent problems in vacuum. To improve this, institute a policy that we must *always* have a resowner backing any attempt to pin a buffer, which we can enforce just by removing the previous special-case code in resowner.c. Add infrastructure to make it easy to create a process-lifespan AuxProcessResourceOwner and clear out its contents at appropriate times. Replace existing ad-hoc resowner management in bgwriter.c and other aux processes with that. (Thus, while the startup process gains a resowner where it had none at all before, some other aux process types are replacing an ad-hoc resowner with this code.) Also use the AuxProcessResourceOwner to manage buffer pins taken during StartupXLOG and ShutdownXLOG, even when those are being run in a bootstrap process or a standalone backend rather than a true auxiliary process. In passing, remove some other ad-hoc resource owner creations that had gotten cargo-culted into various other places. As far as I can tell that was all unnecessary, and if it had been necessary it was incomplete, due to lacking any provision for clearing those resowners later. (Also worth noting in this connection is that a process that hasn't called InitBufferPoolBackend has no business accessing buffers; so there's more to do than just add the resowner if we want to touch buffers in processes not covered by this patch.) Although this fixes a very old bug, no back-patch, because there's no evidence of any significant problem in non-assert builds. Patch by me, pursuant to a report from Justin Pryzby. Thanks to Robert Haas and Kyotaro Horiguchi for reviews. Discussion: https://postgr.es/m/20180627233939.GA10276@telsasoft.com
2018-07-18 18:15:16 +02:00
/*
* We should have an aux process resource owner to use, and we should not
* be in a transaction that's installed some other resowner.
*/
Assert(AuxProcessResourceOwner != NULL);
Assert(CurrentResourceOwner == NULL ||
CurrentResourceOwner == AuxProcessResourceOwner);
CurrentResourceOwner = AuxProcessResourceOwner;
/* Don't be chatty in standalone mode */
ereport(IsPostmasterEnvironment ? LOG : NOTICE,
(errmsg("shutting down")));
Prevent possibility of panics during shutdown checkpoint. When the checkpointer writes the shutdown checkpoint, it checks afterwards whether any WAL has been written since it started and throws a PANIC if so. At that point, only walsenders are still active, so one might think this could not happen, but walsenders can also generate WAL, for instance in BASE_BACKUP and logical decoding related commands (e.g. via hint bits). So they can trigger this panic if such a command is run while the shutdown checkpoint is being written. To fix this, divide the walsender shutdown into two phases. First, checkpointer, itself triggered by postmaster, sends a PROCSIG_WALSND_INIT_STOPPING signal to all walsenders. If the backend is idle or runs an SQL query this causes the backend to shutdown, if logical replication is in progress all existing WAL records are processed followed by a shutdown. Otherwise this causes the walsender to switch to the "stopping" state. In this state, the walsender will reject any further replication commands. The checkpointer begins the shutdown checkpoint once all walsenders are confirmed as stopping. When the shutdown checkpoint finishes, the postmaster sends us SIGUSR2. This instructs walsender to send any outstanding WAL, including the shutdown checkpoint record, wait for it to be replicated to the standby, and then exit. Author: Andres Freund, based on an earlier patch by Michael Paquier Reported-By: Fujii Masao, Andres Freund Reviewed-By: Michael Paquier Discussion: https://postgr.es/m/20170602002912.tqlwn4gymzlxpvs2@alap3.anarazel.de Backpatch: 9.4, where logical decoding was introduced
2017-06-06 03:53:41 +02:00
/*
* Signal walsenders to move to stopping state.
*/
WalSndInitStopping();
/*
* Wait for WAL senders to be in stopping state. This prevents commands
* from writing new WAL.
*/
WalSndWaitStopping();
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
if (RecoveryInProgress())
CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
else
{
/*
* If archiving is enabled, rotate the last XLOG file so that all the
* remaining records are archived (postmaster wakes up the archiver
* process one more time at the end of shutdown). The checkpoint
* record will go to the next XLOG file and won't be archived (yet).
*/
if (XLogArchivingActive())
RequestXLogSwitch(false);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
}
}
/*
* Log start of a checkpoint.
*/
static void
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
LogCheckpointStart(int flags, bool restartpoint)
{
if (restartpoint)
ereport(LOG,
/* translator: the placeholders show checkpoint options */
(errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
(flags & CHECKPOINT_FORCE) ? " force" : "",
(flags & CHECKPOINT_WAIT) ? " wait" : "",
(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
else
ereport(LOG,
/* translator: the placeholders show checkpoint options */
(errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
(flags & CHECKPOINT_FORCE) ? " force" : "",
(flags & CHECKPOINT_WAIT) ? " wait" : "",
(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
}
/*
* Log end of a checkpoint.
*/
static void
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
LogCheckpointEnd(bool restartpoint)
{
Fix and simplify some usages of TimestampDifference(). Introduce TimestampDifferenceMilliseconds() to simplify callers that would rather have the difference in milliseconds, instead of the select()-oriented seconds-and-microseconds format. This gets rid of at least one integer division per call, and it eliminates some apparently-easy-to-mess-up arithmetic. Two of these call sites were in fact wrong: * pg_prewarm's autoprewarm_main() forgot to multiply the seconds by 1000, thus ending up with a delay 1000X shorter than intended. That doesn't quite make it a busy-wait, but close. * postgres_fdw's pgfdw_get_cleanup_result() thought it needed to compute microseconds not milliseconds, thus ending up with a delay 1000X longer than intended. Somebody along the way had noticed this problem but misdiagnosed the cause, and imposed an ad-hoc 60-second limit rather than fixing the units. This was relatively harmless in context, because we don't care that much about exactly how long this delay is; still, it's wrong. There are a few more callers of TimestampDifference() that don't have a direct need for seconds-and-microseconds, but can't use TimestampDifferenceMilliseconds() either because they do need microsecond precision or because they might possibly deal with intervals long enough to overflow 32-bit milliseconds. It might be worth inventing another API to improve that, but that seems outside the scope of this patch; so those callers are untouched here. Given the fact that we are fixing some bugs, and the likelihood that future patches might want to back-patch code that uses this new API, back-patch to all supported branches. Alexey Kondratov and Tom Lane Discussion: https://postgr.es/m/3b1c053a21c07c1ed5e00be3b2b855ef@postgrespro.ru
2020-11-11 04:51:18 +01:00
long write_msecs,
sync_msecs,
total_msecs,
longest_msecs,
average_msecs;
uint64 average_sync_time;
CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
Fix and simplify some usages of TimestampDifference(). Introduce TimestampDifferenceMilliseconds() to simplify callers that would rather have the difference in milliseconds, instead of the select()-oriented seconds-and-microseconds format. This gets rid of at least one integer division per call, and it eliminates some apparently-easy-to-mess-up arithmetic. Two of these call sites were in fact wrong: * pg_prewarm's autoprewarm_main() forgot to multiply the seconds by 1000, thus ending up with a delay 1000X shorter than intended. That doesn't quite make it a busy-wait, but close. * postgres_fdw's pgfdw_get_cleanup_result() thought it needed to compute microseconds not milliseconds, thus ending up with a delay 1000X longer than intended. Somebody along the way had noticed this problem but misdiagnosed the cause, and imposed an ad-hoc 60-second limit rather than fixing the units. This was relatively harmless in context, because we don't care that much about exactly how long this delay is; still, it's wrong. There are a few more callers of TimestampDifference() that don't have a direct need for seconds-and-microseconds, but can't use TimestampDifferenceMilliseconds() either because they do need microsecond precision or because they might possibly deal with intervals long enough to overflow 32-bit milliseconds. It might be worth inventing another API to improve that, but that seems outside the scope of this patch; so those callers are untouched here. Given the fact that we are fixing some bugs, and the likelihood that future patches might want to back-patch code that uses this new API, back-patch to all supported branches. Alexey Kondratov and Tom Lane Discussion: https://postgr.es/m/3b1c053a21c07c1ed5e00be3b2b855ef@postgrespro.ru
2020-11-11 04:51:18 +01:00
write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
CheckpointStats.ckpt_sync_t);
Fix and simplify some usages of TimestampDifference(). Introduce TimestampDifferenceMilliseconds() to simplify callers that would rather have the difference in milliseconds, instead of the select()-oriented seconds-and-microseconds format. This gets rid of at least one integer division per call, and it eliminates some apparently-easy-to-mess-up arithmetic. Two of these call sites were in fact wrong: * pg_prewarm's autoprewarm_main() forgot to multiply the seconds by 1000, thus ending up with a delay 1000X shorter than intended. That doesn't quite make it a busy-wait, but close. * postgres_fdw's pgfdw_get_cleanup_result() thought it needed to compute microseconds not milliseconds, thus ending up with a delay 1000X longer than intended. Somebody along the way had noticed this problem but misdiagnosed the cause, and imposed an ad-hoc 60-second limit rather than fixing the units. This was relatively harmless in context, because we don't care that much about exactly how long this delay is; still, it's wrong. There are a few more callers of TimestampDifference() that don't have a direct need for seconds-and-microseconds, but can't use TimestampDifferenceMilliseconds() either because they do need microsecond precision or because they might possibly deal with intervals long enough to overflow 32-bit milliseconds. It might be worth inventing another API to improve that, but that seems outside the scope of this patch; so those callers are untouched here. Given the fact that we are fixing some bugs, and the likelihood that future patches might want to back-patch code that uses this new API, back-patch to all supported branches. Alexey Kondratov and Tom Lane Discussion: https://postgr.es/m/3b1c053a21c07c1ed5e00be3b2b855ef@postgrespro.ru
2020-11-11 04:51:18 +01:00
sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
CheckpointStats.ckpt_sync_end_t);
/* Accumulate checkpoint timing summary data, in milliseconds. */
PendingCheckpointerStats.m_checkpoint_write_time += write_msecs;
PendingCheckpointerStats.m_checkpoint_sync_time += sync_msecs;
/*
* All of the published timing statistics are accounted for. Only
* continue if a log message is to be written.
*/
if (!log_checkpoints)
return;
Fix and simplify some usages of TimestampDifference(). Introduce TimestampDifferenceMilliseconds() to simplify callers that would rather have the difference in milliseconds, instead of the select()-oriented seconds-and-microseconds format. This gets rid of at least one integer division per call, and it eliminates some apparently-easy-to-mess-up arithmetic. Two of these call sites were in fact wrong: * pg_prewarm's autoprewarm_main() forgot to multiply the seconds by 1000, thus ending up with a delay 1000X shorter than intended. That doesn't quite make it a busy-wait, but close. * postgres_fdw's pgfdw_get_cleanup_result() thought it needed to compute microseconds not milliseconds, thus ending up with a delay 1000X longer than intended. Somebody along the way had noticed this problem but misdiagnosed the cause, and imposed an ad-hoc 60-second limit rather than fixing the units. This was relatively harmless in context, because we don't care that much about exactly how long this delay is; still, it's wrong. There are a few more callers of TimestampDifference() that don't have a direct need for seconds-and-microseconds, but can't use TimestampDifferenceMilliseconds() either because they do need microsecond precision or because they might possibly deal with intervals long enough to overflow 32-bit milliseconds. It might be worth inventing another API to improve that, but that seems outside the scope of this patch; so those callers are untouched here. Given the fact that we are fixing some bugs, and the likelihood that future patches might want to back-patch code that uses this new API, back-patch to all supported branches. Alexey Kondratov and Tom Lane Discussion: https://postgr.es/m/3b1c053a21c07c1ed5e00be3b2b855ef@postgrespro.ru
2020-11-11 04:51:18 +01:00
total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
CheckpointStats.ckpt_end_t);
/*
* Timing values returned from CheckpointStats are in microseconds.
Fix and simplify some usages of TimestampDifference(). Introduce TimestampDifferenceMilliseconds() to simplify callers that would rather have the difference in milliseconds, instead of the select()-oriented seconds-and-microseconds format. This gets rid of at least one integer division per call, and it eliminates some apparently-easy-to-mess-up arithmetic. Two of these call sites were in fact wrong: * pg_prewarm's autoprewarm_main() forgot to multiply the seconds by 1000, thus ending up with a delay 1000X shorter than intended. That doesn't quite make it a busy-wait, but close. * postgres_fdw's pgfdw_get_cleanup_result() thought it needed to compute microseconds not milliseconds, thus ending up with a delay 1000X longer than intended. Somebody along the way had noticed this problem but misdiagnosed the cause, and imposed an ad-hoc 60-second limit rather than fixing the units. This was relatively harmless in context, because we don't care that much about exactly how long this delay is; still, it's wrong. There are a few more callers of TimestampDifference() that don't have a direct need for seconds-and-microseconds, but can't use TimestampDifferenceMilliseconds() either because they do need microsecond precision or because they might possibly deal with intervals long enough to overflow 32-bit milliseconds. It might be worth inventing another API to improve that, but that seems outside the scope of this patch; so those callers are untouched here. Given the fact that we are fixing some bugs, and the likelihood that future patches might want to back-patch code that uses this new API, back-patch to all supported branches. Alexey Kondratov and Tom Lane Discussion: https://postgr.es/m/3b1c053a21c07c1ed5e00be3b2b855ef@postgrespro.ru
2020-11-11 04:51:18 +01:00
* Convert to milliseconds for consistent printing.
*/
Fix and simplify some usages of TimestampDifference(). Introduce TimestampDifferenceMilliseconds() to simplify callers that would rather have the difference in milliseconds, instead of the select()-oriented seconds-and-microseconds format. This gets rid of at least one integer division per call, and it eliminates some apparently-easy-to-mess-up arithmetic. Two of these call sites were in fact wrong: * pg_prewarm's autoprewarm_main() forgot to multiply the seconds by 1000, thus ending up with a delay 1000X shorter than intended. That doesn't quite make it a busy-wait, but close. * postgres_fdw's pgfdw_get_cleanup_result() thought it needed to compute microseconds not milliseconds, thus ending up with a delay 1000X longer than intended. Somebody along the way had noticed this problem but misdiagnosed the cause, and imposed an ad-hoc 60-second limit rather than fixing the units. This was relatively harmless in context, because we don't care that much about exactly how long this delay is; still, it's wrong. There are a few more callers of TimestampDifference() that don't have a direct need for seconds-and-microseconds, but can't use TimestampDifferenceMilliseconds() either because they do need microsecond precision or because they might possibly deal with intervals long enough to overflow 32-bit milliseconds. It might be worth inventing another API to improve that, but that seems outside the scope of this patch; so those callers are untouched here. Given the fact that we are fixing some bugs, and the likelihood that future patches might want to back-patch code that uses this new API, back-patch to all supported branches. Alexey Kondratov and Tom Lane Discussion: https://postgr.es/m/3b1c053a21c07c1ed5e00be3b2b855ef@postgrespro.ru
2020-11-11 04:51:18 +01:00
longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
average_sync_time = 0;
if (CheckpointStats.ckpt_sync_rels > 0)
average_sync_time = CheckpointStats.ckpt_agg_sync_time /
CheckpointStats.ckpt_sync_rels;
Fix and simplify some usages of TimestampDifference(). Introduce TimestampDifferenceMilliseconds() to simplify callers that would rather have the difference in milliseconds, instead of the select()-oriented seconds-and-microseconds format. This gets rid of at least one integer division per call, and it eliminates some apparently-easy-to-mess-up arithmetic. Two of these call sites were in fact wrong: * pg_prewarm's autoprewarm_main() forgot to multiply the seconds by 1000, thus ending up with a delay 1000X shorter than intended. That doesn't quite make it a busy-wait, but close. * postgres_fdw's pgfdw_get_cleanup_result() thought it needed to compute microseconds not milliseconds, thus ending up with a delay 1000X longer than intended. Somebody along the way had noticed this problem but misdiagnosed the cause, and imposed an ad-hoc 60-second limit rather than fixing the units. This was relatively harmless in context, because we don't care that much about exactly how long this delay is; still, it's wrong. There are a few more callers of TimestampDifference() that don't have a direct need for seconds-and-microseconds, but can't use TimestampDifferenceMilliseconds() either because they do need microsecond precision or because they might possibly deal with intervals long enough to overflow 32-bit milliseconds. It might be worth inventing another API to improve that, but that seems outside the scope of this patch; so those callers are untouched here. Given the fact that we are fixing some bugs, and the likelihood that future patches might want to back-patch code that uses this new API, back-patch to all supported branches. Alexey Kondratov and Tom Lane Discussion: https://postgr.es/m/3b1c053a21c07c1ed5e00be3b2b855ef@postgrespro.ru
2020-11-11 04:51:18 +01:00
average_msecs = (long) ((average_sync_time + 999) / 1000);
if (restartpoint)
ereport(LOG,
(errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
"%d WAL file(s) added, %d removed, %d recycled; "
"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
"distance=%d kB, estimate=%d kB",
CheckpointStats.ckpt_bufs_written,
(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
CheckpointStats.ckpt_segs_added,
CheckpointStats.ckpt_segs_removed,
CheckpointStats.ckpt_segs_recycled,
write_msecs / 1000, (int) (write_msecs % 1000),
sync_msecs / 1000, (int) (sync_msecs % 1000),
total_msecs / 1000, (int) (total_msecs % 1000),
CheckpointStats.ckpt_sync_rels,
longest_msecs / 1000, (int) (longest_msecs % 1000),
average_msecs / 1000, (int) (average_msecs % 1000),
(int) (PrevCheckPointDistance / 1024.0),
(int) (CheckPointDistanceEstimate / 1024.0))));
else
ereport(LOG,
(errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
"%d WAL file(s) added, %d removed, %d recycled; "
"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
"distance=%d kB, estimate=%d kB",
CheckpointStats.ckpt_bufs_written,
(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
CheckpointStats.ckpt_segs_added,
CheckpointStats.ckpt_segs_removed,
CheckpointStats.ckpt_segs_recycled,
write_msecs / 1000, (int) (write_msecs % 1000),
sync_msecs / 1000, (int) (sync_msecs % 1000),
total_msecs / 1000, (int) (total_msecs % 1000),
CheckpointStats.ckpt_sync_rels,
longest_msecs / 1000, (int) (longest_msecs % 1000),
average_msecs / 1000, (int) (average_msecs % 1000),
(int) (PrevCheckPointDistance / 1024.0),
(int) (CheckPointDistanceEstimate / 1024.0))));
}
/*
* Update the estimate of distance between checkpoints.
*
* The estimate is used to calculate the number of WAL segments to keep
* preallocated, see XLOGfileslop().
*/
static void
UpdateCheckPointDistanceEstimate(uint64 nbytes)
{
/*
* To estimate the number of segments consumed between checkpoints, keep a
* moving average of the amount of WAL generated in previous checkpoint
* cycles. However, if the load is bursty, with quiet periods and busy
* periods, we want to cater for the peak load. So instead of a plain
* moving average, let the average decline slowly if the previous cycle
* used less WAL than estimated, but bump it up immediately if it used
* more.
*
* When checkpoints are triggered by max_wal_size, this should converge to
* CheckpointSegments * wal_segment_size,
*
* Note: This doesn't pay any attention to what caused the checkpoint.
* Checkpoints triggered manually with CHECKPOINT command, or by e.g.
* starting a base backup, are counted the same as those created
* automatically. The slow-decline will largely mask them out, if they are
* not frequent. If they are frequent, it seems reasonable to count them
* in as any others; if you issue a manual checkpoint every 5 minutes and
* never let a timed checkpoint happen, it makes sense to base the
* preallocation on that 5 minute interval rather than whatever
* checkpoint_timeout is set to.
*/
PrevCheckPointDistance = nbytes;
if (CheckPointDistanceEstimate < nbytes)
CheckPointDistanceEstimate = nbytes;
else
CheckPointDistanceEstimate =
(0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
}
/*
* Update the ps display for a process running a checkpoint. Note that
* this routine should not do any allocations so as it can be called
* from a critical section.
*/
static void
update_checkpoint_display(int flags, bool restartpoint, bool reset)
{
/*
* The status is reported only for end-of-recovery and shutdown
* checkpoints or shutdown restartpoints. Updating the ps display is
* useful in those situations as it may not be possible to rely on
* pg_stat_activity to see the status of the checkpointer or the startup
* process.
*/
if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
return;
if (reset)
set_ps_display("");
else
{
char activitymsg[128];
snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
(flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
(flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
restartpoint ? "restartpoint" : "checkpoint");
set_ps_display(activitymsg);
}
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Perform a checkpoint --- either during shutdown, or on-the-fly
*
* flags is a bitwise OR of the following:
* CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
* CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
* ignoring checkpoint_completion_target parameter.
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
* since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
* CHECKPOINT_END_OF_RECOVERY).
* CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
*
* Note: flags contains other bits, of interest here only for logging purposes.
* In particular note that this routine is synchronous and does not pay
* attention to CHECKPOINT_WAIT.
*
* If !shutdown then we are writing an online checkpoint. This is a very special
* kind of operation and WAL record because the checkpoint action occurs over
* a period of time yet logically occurs at just a single LSN. The logical
* position of the WAL record (redo ptr) is the same or earlier than the
* physical position. When we replay WAL we locate the checkpoint via its
* physical position then read the redo ptr and actually start replay at the
* earlier logical position. Note that we don't write *anything* to WAL at
* the logical position, so that location could be any other kind of WAL record.
* All of this mechanism allows us to continue working while we checkpoint.
* As a result, timing of actions is critical here and be careful to note that
* this function will likely take minutes to execute on a busy system.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
void
CreateCheckPoint(int flags)
{
bool shutdown;
CheckPoint checkPoint;
XLogRecPtr recptr;
XLogSegNo _logSegNo;
XLogCtlInsert *Insert = &XLogCtl->Insert;
uint32 freespace;
XLogRecPtr PriorRedoPtr;
XLogRecPtr curInsert;
XLogRecPtr last_important_lsn;
VirtualTransactionId *vxids;
int nvxids;
int oldXLogAllowed = 0;
/*
* An end-of-recovery checkpoint is really a shutdown checkpoint, just
* issued at a different time.
*/
if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
shutdown = true;
else
shutdown = false;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/* sanity check */
if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
elog(ERROR, "can't create a checkpoint during recovery");
/*
* Prepare to accumulate statistics.
*
* Note: because it is possible for log_checkpoints to change while a
* checkpoint proceeds, we always accumulate stats, even if
* log_checkpoints is currently off.
*/
MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
/*
* Let smgr prepare for checkpoint; this has to happen outside the
* critical section and before we determine the REDO pointer. Note that
* smgr must not do anything that'd have to be undone if we decide no
* checkpoint is needed.
*/
SyncPreCheckpoint();
/*
* Use a critical section to force system panic if we have trouble.
*/
START_CRIT_SECTION();
if (shutdown)
{
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_SHUTDOWNING;
UpdateControlFile();
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
LWLockRelease(ControlFileLock);
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/* Begin filling in the checkpoint WAL record */
MemSet(&checkPoint, 0, sizeof(checkPoint));
checkPoint.time = (pg_time_t) time(NULL);
/*
* For Hot Standby, derive the oldestActiveXid before we fix the redo
* pointer. This allows us to begin accumulating changes to assemble our
* starting snapshot of locks and transactions.
*/
if (!shutdown && XLogStandbyInfoActive())
checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
else
checkPoint.oldestActiveXid = InvalidTransactionId;
/*
* Get location of last important record before acquiring insert locks (as
* GetLastImportantRecPtr() also locks WAL locks).
*/
last_important_lsn = GetLastImportantRecPtr();
/*
* We must block concurrent insertions while examining insert state to
* determine the checkpoint REDO pointer.
*/
WALInsertLockAcquireExclusive();
curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* If this isn't a shutdown or forced checkpoint, and if there has been no
* WAL activity requiring a checkpoint, skip it. The idea here is to
* avoid inserting duplicate checkpoints when the system is idle.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_FORCE)) == 0)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
{
if (last_important_lsn == ControlFile->checkPoint)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
{
WALInsertLockRelease();
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
END_CRIT_SECTION();
ereport(DEBUG1,
(errmsg_internal("checkpoint skipped because system is idle")));
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
return;
}
}
/*
* An end-of-recovery checkpoint is created before anyone is allowed to
* write WAL. To allow us to write the checkpoint record, temporarily
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
* enable XLogInsertAllowed.
*/
if (flags & CHECKPOINT_END_OF_RECOVERY)
oldXLogAllowed = LocalSetXLogInsertAllowed();
checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
if (flags & CHECKPOINT_END_OF_RECOVERY)
checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
else
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
checkPoint.fullPageWrites = Insert->fullPageWrites;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Compute new REDO record ptr = location of next XLOG record.
*
* NB: this is NOT necessarily where the checkpoint record itself will be,
* since other backends may insert more XLOG records while we're off doing
* the buffer flush work. Those XLOG records are logically after the
* checkpoint, even though physically before it. Got that?
*/
freespace = INSERT_FREESPACE(curInsert);
if (freespace == 0)
{
if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
curInsert += SizeOfXLogLongPHD;
else
curInsert += SizeOfXLogShortPHD;
}
checkPoint.redo = curInsert;
2001-03-22 05:01:46 +01:00
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Here we update the shared RedoRecPtr for future XLogInsert calls; this
* must be done while holding all the insertion locks.
*
* Note: if we fail to complete the checkpoint, RedoRecPtr will be left
* pointing past where it really needs to point. This is okay; the only
* consequence is that XLogInsert might back up whole buffers that it
* didn't really need to. We can't postpone advancing RedoRecPtr because
* XLogInserts that happen while we are dumping buffers must assume that
* their buffer changes are not included in the checkpoint.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
2001-03-22 05:01:46 +01:00
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Now we can release the WAL insertion locks, allowing other xacts to
* proceed while we are flushing disk buffers.
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
WALInsertLockRelease();
/* Update the info_lck-protected copy of RedoRecPtr as well */
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->RedoRecPtr = checkPoint.redo;
SpinLockRelease(&XLogCtl->info_lck);
/*
* If enabled, log checkpoint start. We postpone this until now so as not
* to log anything if we decided to skip the checkpoint.
*/
if (log_checkpoints)
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
LogCheckpointStart(flags, false);
/* Update the process title */
update_checkpoint_display(flags, false, false);
TRACE_POSTGRESQL_CHECKPOINT_START(flags);
/*
* Get the other info we need for the checkpoint record.
*
* We don't need to save oldestClogXid in the checkpoint, it only matters
* for the short period in which clog is being truncated, and if we crash
* during that we'll redo the clog truncation and fix up oldestClogXid
* there.
*/
LWLockAcquire(XidGenLock, LW_SHARED);
checkPoint.nextXid = ShmemVariableCache->nextXid;
checkPoint.oldestXid = ShmemVariableCache->oldestXid;
checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
LWLockRelease(XidGenLock);
LWLockAcquire(CommitTsLock, LW_SHARED);
checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
LWLockRelease(CommitTsLock);
LWLockAcquire(OidGenLock, LW_SHARED);
checkPoint.nextOid = ShmemVariableCache->nextOid;
if (!shutdown)
checkPoint.nextOid += ShmemVariableCache->oidCount;
LWLockRelease(OidGenLock);
MultiXactGetCheckptMulti(shutdown,
&checkPoint.nextMulti,
&checkPoint.nextMultiOffset,
&checkPoint.oldestMulti,
&checkPoint.oldestMultiDB);
/*
* Having constructed the checkpoint record, ensure all shmem disk buffers
* and commit-log buffers are flushed to disk.
*
* This I/O could fail for various reasons. If so, we will fail to
* complete the checkpoint, but there is no reason to force a system
* panic. Accordingly, exit critical section while doing it.
*/
END_CRIT_SECTION();
/*
* In some cases there are groups of actions that must all occur on one
* side or the other of a checkpoint record. Before flushing the
* checkpoint record we must explicitly wait for any backend currently
* performing those groups of actions.
*
* One example is end of transaction, so we must wait for any transactions
* that are currently in commit critical sections. If an xact inserted
* its commit record into XLOG just before the REDO point, then a crash
* restart from the REDO point would not replay that record, which means
* that our flushing had better include the xact's update of pg_xact. So
* we wait till he's out of his commit critical section before proceeding.
* See notes in RecordTransactionCommit().
*
* Because we've already released the insertion locks, this test is a bit
* fuzzy: it is possible that we will wait for xacts we didn't really need
* to wait for. But the delay should be short and it seems better to make
* checkpoint take a bit longer than to hold off insertions longer than
* necessary. (In fact, the whole reason we have this issue is that xact.c
* does commit record XLOG insertion and clog update as two separate steps
* protected by different locks, but again that seems best on grounds of
* minimizing lock contention.)
*
* A transaction that has not yet set delayChkpt when we look cannot be at
* risk, since he's not inserted his commit record yet; and one that's
* already cleared it is not at risk either, since he's done fixing clog
* and we will correctly flush the update below. So we cannot miss any
* xacts we need to wait for.
*/
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
if (nvxids > 0)
{
do
{
pg_usleep(10000L); /* wait for 10 msec */
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
DELAY_CHKPT_START));
}
pfree(vxids);
CheckPointGuts(checkPoint.redo, flags);
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
if (nvxids > 0)
{
do
{
pg_usleep(10000L); /* wait for 10 msec */
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
DELAY_CHKPT_COMPLETE));
}
pfree(vxids);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* Take a snapshot of running transactions and write this to WAL. This
* allows us to reconstruct the state of running transactions during
* archive recovery, if required. Skip, if this info disabled.
*
* If we are shutting down, or Startup process is completing crash
* recovery we don't need to write running xact data.
*/
if (!shutdown && XLogStandbyInfoActive())
LogStandbySnapshot();
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
START_CRIT_SECTION();
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Now insert the checkpoint record into XLOG.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
recptr = XLogInsert(RM_XLOG_ID,
shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLOG_CHECKPOINT_ONLINE);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
XLogFlush(recptr);
/*
* We mustn't write any new WAL after a shutdown checkpoint, or it will be
* overwritten at next startup. No-one should even try, this just allows
* sanity-checking. In the case of an end-of-recovery checkpoint, we want
* to just temporarily disable writing until the system has exited
* recovery.
*/
if (shutdown)
{
if (flags & CHECKPOINT_END_OF_RECOVERY)
LocalXLogInsertAllowed = oldXLogAllowed;
else
LocalXLogInsertAllowed = 0; /* never again write WAL */
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* We now have ProcLastRecPtr = start of actual checkpoint record, recptr
* = end of actual checkpoint record.
*/
if (shutdown && checkPoint.redo != ProcLastRecPtr)
ereport(PANIC,
(errmsg("concurrent write-ahead log activity while database system is shutting down")));
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Remember the prior checkpoint's redo ptr for
* UpdateCheckPointDistanceEstimate()
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
PriorRedoPtr = ControlFile->checkPointCopy.redo;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Update the control file.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
/* crash recovery should always recover to the end of WAL */
ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
ControlFile->minRecoveryPointTLI = 0;
/*
* Persist unloggedLSN value. It's reset on crash recovery, so this goes
* unused on non-shutdown checkpoints, but seems useful to store it always
* for debugging purposes.
*/
SpinLockAcquire(&XLogCtl->ulsn_lck);
ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
SpinLockRelease(&XLogCtl->ulsn_lck);
UpdateControlFile();
LWLockRelease(ControlFileLock);
/* Update shared-memory copy of checkpoint XID/epoch */
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->ckptFullXid = checkPoint.nextXid;
SpinLockRelease(&XLogCtl->info_lck);
/*
* We are now done with critical updates; no need for system panic if we
* have trouble while fooling with old log segments.
*/
END_CRIT_SECTION();
/*
* Let smgr do post-checkpoint cleanup (eg, deleting old files).
*/
SyncPostCheckpoint();
/*
* Update the average distance between checkpoints if the prior checkpoint
* exists.
*/
if (PriorRedoPtr != InvalidXLogRecPtr)
UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
/*
* Delete old log files, those no longer needed for last checkpoint to
* prevent the disk holding the xlog from growing full.
*/
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
KeepLogSeg(recptr, &_logSegNo);
if (InvalidateObsoleteReplicationSlots(_logSegNo))
{
/*
* Some slots have been invalidated; recalculate the old-segment
* horizon, starting again from RedoRecPtr.
*/
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
KeepLogSeg(recptr, &_logSegNo);
}
_logSegNo--;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
checkPoint.ThisTimeLineID);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Make more log segments if needed. (Do this after recycling old log
* segments, since that may supply some of the needed files.)
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
if (!shutdown)
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Truncate pg_subtrans if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_subtrans entry older than that (see Asserts
* in subtrans.c). During recovery, though, we mustn't do this because
* StartupSUBTRANS hasn't been called yet.
*/
if (!RecoveryInProgress())
snapshot scalability: Don't compute global horizons while building snapshots. To make GetSnapshotData() more scalable, it cannot not look at at each proc's xmin: While snapshot contents do not need to change whenever a read-only transaction commits or a snapshot is released, a proc's xmin is modified in those cases. The frequency of xmin modifications leads to, particularly on higher core count systems, many cache misses inside GetSnapshotData(), despite the data underlying a snapshot not changing. That is the most significant source of GetSnapshotData() scaling poorly on larger systems. Without accessing xmins, GetSnapshotData() cannot calculate accurate horizons / thresholds as it has so far. But we don't really have to: The horizons don't actually change that much between GetSnapshotData() calls. Nor are the horizons actually used every time a snapshot is built. The trick this commit introduces is to delay computation of accurate horizons until there use and using horizon boundaries to determine whether accurate horizons need to be computed. The use of RecentGlobal[Data]Xmin to decide whether a row version could be removed has been replaces with new GlobalVisTest* functions. These use two thresholds to determine whether a row can be pruned: 1) definitely_needed, indicating that rows deleted by XIDs >= definitely_needed are definitely still visible. 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can definitely be removed GetSnapshotData() updates definitely_needed to be the xmin of the computed snapshot. When testing whether a row can be removed (with GlobalVisTestIsRemovableXid()) and the tested XID falls in between the two (i.e. XID >= maybe_needed && XID < definitely_needed) the boundaries can be recomputed to be more accurate. As it is not cheap to compute accurate boundaries, we limit the number of times that happens in short succession. As the boundaries used by GlobalVisTestIsRemovableXid() are never reset (with maybe_needed updated by GetSnapshotData()), it is likely that further test can benefit from an earlier computation of accurate horizons. To avoid regressing performance when old_snapshot_threshold is set (as that requires an accurate horizon to be computed), heap_page_prune_opt() doesn't unconditionally call TransactionIdLimitedForOldSnapshots() anymore. Both the computation of the limited horizon, and the triggering of errors (with SetOldSnapshotThresholdTimestamp()) is now only done when necessary to remove tuples. This commit just removes the accesses to PGXACT->xmin from GetSnapshotData(), but other members of PGXACT residing in the same cache line are accessed. Therefore this in itself does not result in a significant improvement. Subsequent commits will take advantage of the fact that GetSnapshotData() now does not need to access xmins anymore. Note: This contains a workaround in heap_page_prune_opt() to keep the snapshot_too_old tests working. While that workaround is ugly, the tests currently are not meaningful, and it seems best to address them separately. Author: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Reviewed-By: Thomas Munro <thomas.munro@gmail.com> Reviewed-By: David Rowley <dgrowleyml@gmail.com> Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de
2020-08-13 01:03:49 +02:00
TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
/* Real work is done; log and update stats. */
LogCheckpointEnd(false);
/* Reset the process title */
update_checkpoint_display(flags, false, true);
TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
NBuffers,
CheckpointStats.ckpt_segs_added,
CheckpointStats.ckpt_segs_removed,
CheckpointStats.ckpt_segs_recycled);
}
2000-10-21 17:43:36 +02:00
/*
* Mark the end of recovery in WAL though without running a full checkpoint.
* We can expect that a restartpoint is likely to be in progress as we
* do this, though we are unwilling to wait for it to complete.
*
* CreateRestartPoint() allows for the case where recovery may end before
* the restartpoint completes so there is no concern of concurrent behaviour.
*/
static void
CreateEndOfRecoveryRecord(void)
{
xl_end_of_recovery xlrec;
XLogRecPtr recptr;
/* sanity check */
if (!RecoveryInProgress())
elog(ERROR, "can only be used to end recovery");
xlrec.end_time = GetCurrentTimestamp();
WALInsertLockAcquireExclusive();
xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
WALInsertLockRelease();
START_CRIT_SECTION();
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
XLogFlush(recptr);
/*
* Update the control file so that crash recovery can follow the timeline
* changes to this point.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->minRecoveryPoint = recptr;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
UpdateControlFile();
LWLockRelease(ControlFileLock);
END_CRIT_SECTION();
}
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
/*
* Write an OVERWRITE_CONTRECORD message.
*
* When on WAL replay we expect a continuation record at the start of a page
* that is not there, recovery ends and WAL writing resumes at that point.
* But it's wrong to resume writing new WAL back at the start of the record
* that was broken, because downstream consumers of that WAL (physical
* replicas) are not prepared to "rewind". So the first action after
* finishing replay of all valid WAL must be to write a record of this type
* at the point where the contrecord was missing; to support xlogreader
* detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
* to the page header where the record occurs. xlogreader has an ad-hoc
* mechanism to report metadata about the broken record, which is what we
* use here.
*
* At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
* skip the record it was reading, and pass back the LSN of the skipped
* record, so that its caller can verify (on "replay" of that record) that the
* XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
*
* 'aborted_lsn' is the beginning position of the record that was incomplete.
* It is included in the WAL record. 'pagePtr' and 'newTLI' point to the
* beginning of the XLOG page where the record is to be inserted. They must
* match the current WAL insert position, they're passed here just so that we
* can verify that.
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
*/
static XLogRecPtr
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
TimeLineID newTLI)
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
{
xl_overwrite_contrecord xlrec;
XLogRecPtr recptr;
XLogPageHeader pagehdr;
XLogRecPtr startPos;
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
/* sanity checks */
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
if (!RecoveryInProgress())
elog(ERROR, "can only be used at end of recovery");
if (pagePtr % XLOG_BLCKSZ != 0)
elog(ERROR, "invalid position for missing continuation record %X/%X",
LSN_FORMAT_ARGS(pagePtr));
/* The current WAL insert position should be right after the page header */
startPos = pagePtr;
if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
startPos += SizeOfXLogLongPHD;
else
startPos += SizeOfXLogShortPHD;
recptr = GetXLogInsertRecPtr();
if (recptr != startPos)
elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
LSN_FORMAT_ARGS(recptr));
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
START_CRIT_SECTION();
/*
* Initialize the XLOG page header (by GetXLogBuffer), and set the
* XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
*
* No other backend is allowed to write WAL yet, so acquiring the WAL
* insertion lock is just pro forma.
*/
WALInsertLockAcquire();
pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
WALInsertLockRelease();
/*
* Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
* page. We know it becomes the first record, because no other backend is
* allowed to write WAL yet.
*/
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
XLogBeginInsert();
xlrec.overwritten_lsn = aborted_lsn;
xlrec.overwrite_time = GetCurrentTimestamp();
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));
recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
/* check that the record was inserted to the right place */
if (ProcLastRecPtr != startPos)
elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
LSN_FORMAT_ARGS(ProcLastRecPtr));
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
XLogFlush(recptr);
END_CRIT_SECTION();
return recptr;
}
/*
* Flush all data in shared memory to disk, and fsync
*
* This is the common code shared between regular checkpoints and
* recovery restartpoints.
*/
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
CheckPointRelationMap();
CheckPointReplicationSlots();
CheckPointSnapBuild();
CheckPointLogicalRewriteHeap();
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
CheckPointReplicationOrigin();
/* Write out all dirty data in SLRUs and the main buffer pool */
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
CheckPointCLOG();
CheckPointCommitTs();
CheckPointSUBTRANS();
CheckPointMultiXact();
CheckPointPredicate();
CheckPointBuffers(flags);
/* Perform all queued up fsyncs */
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
ProcessSyncRequests();
CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
/* We deliberately delay 2PC checkpointing as long as possible */
CheckPointTwoPhase(checkPointRedo);
}
/*
* Save a checkpoint for recovery restart if appropriate
*
* This function is called each time a checkpoint record is read from XLOG.
* It must determine whether the checkpoint represents a safe restartpoint or
* not. If so, the checkpoint record is stashed in shared memory so that
* CreateRestartPoint can consult it. (Note that the latter function is
* executed by the checkpointer, while this one will be executed by the
* startup process.)
*/
static void
xlog.c: Remove global variables ReadRecPtr and EndRecPtr. In most places, the variables necessarily store the same value as the eponymous members of the XLogReaderState that we use during WAL replay, because ReadRecord() assigns the values from the structure members to the global variables just after XLogReadRecord() returns. However, XLogBeginRead() adjusts the structure members but not the global variables, so after XLogBeginRead() and before the completion of XLogReadRecord() the values can differ. Otherwise, they must be identical. According to my analysis, the only place where either variable is referenced at a point where it might not have the same value as the structure member is the refrence to EndRecPtr within XLogPageRead. Therefore, at every other place where we are using the global variable, we can just switch to using the structure member instead, and remove the global variable. However, we can, and in fact should, do this in XLogPageRead() as well, because at that point in the code, the global variable will actually store the start of the record we want to read - either because it's where the last WAL record ended, or because the read position has been changed using XLogBeginRead since the last record was read. The structure member, on the other hand, will already have been updated to point to the end of the record we just read. Elsewhere, the latter is what we use as an argument to emode_for_corrupt_record(), so we should do the same here. This part of the patch is perhaps a bug fix, but I don't think it has any important consequences, so no back-patch. The point here is just to continue to whittle down the entirely excessive use of global variables in xlog.c. Discussion: http://postgr.es/m/CA+Tgmoao96EuNeSPd+hspRKcsCddu=b1h-QNRuKfY8VmfNQdfg@mail.gmail.com
2021-11-24 17:27:39 +01:00
RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
{
/*
* Also refrain from creating a restartpoint if we have seen any
* references to non-existent pages. Restarting recovery from the
* restartpoint would not see the references, so we would lose the
* cross-check that the pages belonged to a relation that was dropped
* later.
*/
if (XLogHaveInvalidPages())
{
elog(trace_recovery(DEBUG2),
"could not record restart point at %X/%X because there "
"are unresolved references to invalid pages",
LSN_FORMAT_ARGS(checkPoint->redo));
return;
}
/*
* Copy the checkpoint record to shared memory, so that checkpointer can
* work out the next time it wants to perform a restartpoint.
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
SpinLockAcquire(&XLogCtl->info_lck);
xlog.c: Remove global variables ReadRecPtr and EndRecPtr. In most places, the variables necessarily store the same value as the eponymous members of the XLogReaderState that we use during WAL replay, because ReadRecord() assigns the values from the structure members to the global variables just after XLogReadRecord() returns. However, XLogBeginRead() adjusts the structure members but not the global variables, so after XLogBeginRead() and before the completion of XLogReadRecord() the values can differ. Otherwise, they must be identical. According to my analysis, the only place where either variable is referenced at a point where it might not have the same value as the structure member is the refrence to EndRecPtr within XLogPageRead. Therefore, at every other place where we are using the global variable, we can just switch to using the structure member instead, and remove the global variable. However, we can, and in fact should, do this in XLogPageRead() as well, because at that point in the code, the global variable will actually store the start of the record we want to read - either because it's where the last WAL record ended, or because the read position has been changed using XLogBeginRead since the last record was read. The structure member, on the other hand, will already have been updated to point to the end of the record we just read. Elsewhere, the latter is what we use as an argument to emode_for_corrupt_record(), so we should do the same here. This part of the patch is perhaps a bug fix, but I don't think it has any important consequences, so no back-patch. The point here is just to continue to whittle down the entirely excessive use of global variables in xlog.c. Discussion: http://postgr.es/m/CA+Tgmoao96EuNeSPd+hspRKcsCddu=b1h-QNRuKfY8VmfNQdfg@mail.gmail.com
2021-11-24 17:27:39 +01:00
XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
XLogCtl->lastCheckPoint = *checkPoint;
SpinLockRelease(&XLogCtl->info_lck);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
}
/*
* Establish a restartpoint if possible.
*
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
* This is similar to CreateCheckPoint, but is used during WAL recovery
* to establish a point from which recovery can roll forward without
* replaying the entire recovery log.
*
* Returns true if a new restartpoint was established. We can only establish
* a restartpoint if we have replayed a safe checkpoint record since last
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
* restartpoint.
*/
bool
CreateRestartPoint(int flags)
{
XLogRecPtr lastCheckPointRecPtr;
XLogRecPtr lastCheckPointEndPtr;
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
CheckPoint lastCheckPoint;
XLogRecPtr PriorRedoPtr;
XLogRecPtr receivePtr;
XLogRecPtr replayPtr;
TimeLineID replayTLI;
XLogRecPtr endptr;
XLogSegNo _logSegNo;
TimestampTz xtime;
/* Get a local copy of the last safe checkpoint record. */
SpinLockAcquire(&XLogCtl->info_lck);
lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
lastCheckPoint = XLogCtl->lastCheckPoint;
SpinLockRelease(&XLogCtl->info_lck);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* Check that we're still in recovery mode. It's ok if we exit recovery
* mode after this check, the restart point is valid anyway.
*/
if (!RecoveryInProgress())
{
ereport(DEBUG2,
(errmsg_internal("skipping restartpoint, recovery has already ended")));
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
return false;
}
/*
* If the last checkpoint record we've replayed is already our last
* restartpoint, we can't perform a new restart point. We still update
* minRecoveryPoint in that case, so that if this is a shutdown restart
* point, we won't start up earlier than before. That's not strictly
* necessary, but when hot standby is enabled, it would be rather weird if
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
* the database opened up for read-only connections at a point-in-time
* before the last shutdown. Such time travel is still possible in case of
* immediate shutdown, though.
*
* We don't explicitly advance minRecoveryPoint when we do create a
* restartpoint. It's assumed that flushing the buffers will do that as a
* side-effect.
*/
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
{
ereport(DEBUG2,
(errmsg_internal("skipping restartpoint, already performed at %X/%X",
LSN_FORMAT_ARGS(lastCheckPoint.redo))));
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
if (flags & CHECKPOINT_IS_SHUTDOWN)
{
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
return false;
}
/*
* Update the shared RedoRecPtr so that the startup process can calculate
* the number of segments replayed since last restartpoint, and request a
* restartpoint if it exceeds CheckPointSegments.
*
* Like in CreateCheckPoint(), hold off insertions to update it, although
* during recovery this is just pro forma, because no WAL insertions are
* happening.
*/
WALInsertLockAcquireExclusive();
RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
WALInsertLockRelease();
/* Also update the info_lck-protected copy */
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->RedoRecPtr = lastCheckPoint.redo;
SpinLockRelease(&XLogCtl->info_lck);
/*
* Prepare to accumulate statistics.
*
* Note: because it is possible for log_checkpoints to change while a
* checkpoint proceeds, we always accumulate stats, even if
* log_checkpoints is currently off.
*/
MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
if (log_checkpoints)
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
LogCheckpointStart(flags, true);
/* Update the process title */
update_checkpoint_display(flags, true, false);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
CheckPointGuts(lastCheckPoint.redo, flags);
/*
* Remember the prior checkpoint's redo ptr for
* UpdateCheckPointDistanceEstimate()
*/
PriorRedoPtr = ControlFile->checkPointCopy.redo;
/*
* Update pg_control, using current time. Check that it still shows
* DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
* this is a quick hack to make sure nothing really bad happens if somehow
* we get here after the end-of-recovery checkpoint.
*/
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
{
ControlFile->checkPoint = lastCheckPointRecPtr;
ControlFile->checkPointCopy = lastCheckPoint;
/*
* Ensure minRecoveryPoint is past the checkpoint record. Normally,
* this will have happened already while writing out dirty buffers,
* but not necessarily - e.g. because no buffers were dirtied. We do
* this because a non-exclusive base backup uses minRecoveryPoint to
* determine which WAL files must be included in the backup, and the
* file (or files) containing the checkpoint record must be included,
* at a minimum. Note that for an ordinary restart of recovery there's
* no value in having the minimum recovery point any earlier than this
* anyway, because redo will begin just after the checkpoint record.
*/
if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
{
ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
/* update local copy */
LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
}
if (flags & CHECKPOINT_IS_SHUTDOWN)
ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
UpdateControlFile();
}
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
LWLockRelease(ControlFileLock);
/*
* Update the average distance between checkpoints/restartpoints if the
* prior checkpoint exists.
*/
if (PriorRedoPtr != InvalidXLogRecPtr)
UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
/*
* Delete old log files, those no longer needed for last restartpoint to
* prevent the disk holding the xlog from growing full.
*/
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
/*
* Retreat _logSegNo using the current end of xlog replayed or received,
* whichever is later.
*/
receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
replayPtr = GetXLogReplayRecPtr(&replayTLI);
endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
KeepLogSeg(endptr, &_logSegNo);
if (InvalidateObsoleteReplicationSlots(_logSegNo))
{
/*
* Some slots have been invalidated; recalculate the old-segment
* horizon, starting again from RedoRecPtr.
*/
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
KeepLogSeg(endptr, &_logSegNo);
}
_logSegNo--;
/*
* Try to recycle segments on a useful timeline. If we've been promoted
* since the beginning of this restartpoint, use the new timeline chosen
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
* at end of recovery. If we're still in recovery, use the timeline we're
* currently replaying.
*
* There is no guarantee that the WAL segments will be useful on the
* current timeline; if recovery proceeds to a new timeline right after
* this, the pre-allocated WAL segments on this timeline will not be used,
* and will go wasted until recycled on the next restartpoint. We'll live
* with that.
*/
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
if (!RecoveryInProgress())
replayTLI = XLogCtl->InsertTimeLineID;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
/*
* Make more log segments if needed. (Do this after recycling old log
* segments, since that may supply some of the needed files.)
*/
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
PreallocXlogFiles(endptr, replayTLI);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* Truncate pg_subtrans if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_subtrans entry older than that (see Asserts
* in subtrans.c). When hot standby is disabled, though, we mustn't do
* this because StartupSUBTRANS hasn't been called yet.
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*/
if (EnableHotStandby)
snapshot scalability: Don't compute global horizons while building snapshots. To make GetSnapshotData() more scalable, it cannot not look at at each proc's xmin: While snapshot contents do not need to change whenever a read-only transaction commits or a snapshot is released, a proc's xmin is modified in those cases. The frequency of xmin modifications leads to, particularly on higher core count systems, many cache misses inside GetSnapshotData(), despite the data underlying a snapshot not changing. That is the most significant source of GetSnapshotData() scaling poorly on larger systems. Without accessing xmins, GetSnapshotData() cannot calculate accurate horizons / thresholds as it has so far. But we don't really have to: The horizons don't actually change that much between GetSnapshotData() calls. Nor are the horizons actually used every time a snapshot is built. The trick this commit introduces is to delay computation of accurate horizons until there use and using horizon boundaries to determine whether accurate horizons need to be computed. The use of RecentGlobal[Data]Xmin to decide whether a row version could be removed has been replaces with new GlobalVisTest* functions. These use two thresholds to determine whether a row can be pruned: 1) definitely_needed, indicating that rows deleted by XIDs >= definitely_needed are definitely still visible. 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can definitely be removed GetSnapshotData() updates definitely_needed to be the xmin of the computed snapshot. When testing whether a row can be removed (with GlobalVisTestIsRemovableXid()) and the tested XID falls in between the two (i.e. XID >= maybe_needed && XID < definitely_needed) the boundaries can be recomputed to be more accurate. As it is not cheap to compute accurate boundaries, we limit the number of times that happens in short succession. As the boundaries used by GlobalVisTestIsRemovableXid() are never reset (with maybe_needed updated by GetSnapshotData()), it is likely that further test can benefit from an earlier computation of accurate horizons. To avoid regressing performance when old_snapshot_threshold is set (as that requires an accurate horizon to be computed), heap_page_prune_opt() doesn't unconditionally call TransactionIdLimitedForOldSnapshots() anymore. Both the computation of the limited horizon, and the triggering of errors (with SetOldSnapshotThresholdTimestamp()) is now only done when necessary to remove tuples. This commit just removes the accesses to PGXACT->xmin from GetSnapshotData(), but other members of PGXACT residing in the same cache line are accessed. Therefore this in itself does not result in a significant improvement. Subsequent commits will take advantage of the fact that GetSnapshotData() now does not need to access xmins anymore. Note: This contains a workaround in heap_page_prune_opt() to keep the snapshot_too_old tests working. While that workaround is ugly, the tests currently are not meaningful, and it seems best to address them separately. Author: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Reviewed-By: Thomas Munro <thomas.munro@gmail.com> Reviewed-By: David Rowley <dgrowleyml@gmail.com> Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de
2020-08-13 01:03:49 +02:00
TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/* Real work is done; log and update stats. */
LogCheckpointEnd(true);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/* Reset the process title */
update_checkpoint_display(flags, true, true);
xtime = GetLatestXTime();
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
ereport((log_checkpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
LSN_FORMAT_ARGS(lastCheckPoint.redo)),
xtime ? errdetail("Last completed transaction was at log time %s.",
timestamptz_to_str(xtime)) : 0));
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
/*
* Finally, execute archive_cleanup_command, if any.
*/
if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
ExecuteRecoveryCommand(archiveCleanupCommand,
"archive_cleanup_command",
false,
WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
return true;
}
/*
* Report availability of WAL for the given target LSN
* (typically a slot's restart_lsn)
*
* Returns one of the following enum values:
*
* * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
* max_wal_size.
*
* * WALAVAIL_EXTENDED means it is still available by preserving extra
* segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
* than max_wal_size, this state is not returned.
*
* * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
* remove reserved segments. The walsender using this slot may return to the
* above.
*
* * WALAVAIL_REMOVED means it has been removed. A replication stream on
* a slot with this LSN cannot continue after a restart.
*
* * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
*/
WALAvailability
GetWALAvailability(XLogRecPtr targetLSN)
{
XLogRecPtr currpos; /* current write LSN */
XLogSegNo currSeg; /* segid of currpos */
XLogSegNo targetSeg; /* segid of targetLSN */
XLogSegNo oldestSeg; /* actual oldest segid */
XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */
XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */
uint64 keepSegs;
/*
* slot does not reserve WAL. Either deactivated, or has never been active
*/
if (XLogRecPtrIsInvalid(targetLSN))
return WALAVAIL_INVALID_LSN;
/*
* Calculate the oldest segment currently reserved by all slots,
* considering wal_keep_size and max_slot_wal_keep_size. Initialize
* oldestSlotSeg to the current segment.
*/
currpos = GetXLogWriteRecPtr();
XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
KeepLogSeg(currpos, &oldestSlotSeg);
/*
* Find the oldest extant segment file. We get 1 until checkpoint removes
* the first WAL segment file since startup, which causes the status being
* wrong under certain abnormal conditions but that doesn't actually harm.
*/
oldestSeg = XLogGetLastRemovedSegno() + 1;
/* calculate oldest segment by max_wal_size */
XLByteToSeg(currpos, currSeg, wal_segment_size);
keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
if (currSeg > keepSegs)
oldestSegMaxWalSize = currSeg - keepSegs;
else
oldestSegMaxWalSize = 1;
/* the segment we care about */
XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
/*
* No point in returning reserved or extended status values if the
* targetSeg is known to be lost.
*/
if (targetSeg >= oldestSlotSeg)
{
/* show "reserved" when targetSeg is within max_wal_size */
if (targetSeg >= oldestSegMaxWalSize)
return WALAVAIL_RESERVED;
/* being retained by slots exceeding max_wal_size */
return WALAVAIL_EXTENDED;
}
/* WAL segments are no longer retained but haven't been removed yet */
if (targetSeg >= oldestSeg)
return WALAVAIL_UNRESERVED;
/* Definitely lost */
return WALAVAIL_REMOVED;
}
/*
* Retreat *logSegNo to the last segment that we need to retain because of
* either wal_keep_size or replication slots.
*
* This is calculated by subtracting wal_keep_size from the given xlog
* location, recptr and by making sure that that result is below the
* requirement of replication slots. For the latter criterion we do consider
* the effects of max_slot_wal_keep_size: reserve at most that much space back
* from recptr.
*
* Note about replication slots: if this function calculates a value
* that's further ahead than what slots need reserved, then affected
* slots need to be invalidated and this function invoked again.
* XXX it might be a good idea to rewrite this function so that
* invalidation is optionally done here, instead.
*/
static void
KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
{
XLogSegNo currSegNo;
XLogSegNo segno;
XLogRecPtr keep;
XLByteToSeg(recptr, currSegNo, wal_segment_size);
segno = currSegNo;
/*
* Calculate how many segments are kept by slots first, adjusting for
* max_slot_wal_keep_size.
*/
keep = XLogGetReplicationSlotMinimumLSN();
if (keep != InvalidXLogRecPtr)
{
XLByteToSeg(keep, segno, wal_segment_size);
/* Cap by max_slot_wal_keep_size ... */
if (max_slot_wal_keep_size_mb >= 0)
{
uint64 slot_keep_segs;
slot_keep_segs =
ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
if (currSegNo - segno > slot_keep_segs)
segno = currSegNo - slot_keep_segs;
}
}
/* but, keep at least wal_keep_size if that's set */
if (wal_keep_size_mb > 0)
{
uint64 keep_segs;
keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
if (currSegNo - segno < keep_segs)
{
/* avoid underflow, don't go below 1 */
if (currSegNo <= keep_segs)
segno = 1;
else
segno = currSegNo - keep_segs;
}
}
/* don't delete WAL segments newer than the calculated segment */
if (segno < *logSegNo)
*logSegNo = segno;
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* Write a NEXTOID log record
*/
void
XLogPutNextOid(Oid nextOid)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterData((char *) (&nextOid), sizeof(Oid));
(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
2005-10-15 04:49:52 +02:00
/*
* We need not flush the NEXTOID record immediately, because any of the
* just-allocated OIDs could only reach disk as part of a tuple insert or
* update that would have its own XLOG record that must follow the NEXTOID
* record. Therefore, the standard buffer LSN interlock applied to those
* records will ensure no such OID reaches disk before the NEXTOID record
* does.
*
* Note, however, that the above statement only covers state "within" the
* database. When we use a generated OID as a file or directory name, we
* are in a sense violating the basic WAL rule, because that filesystem
* change may reach disk before the NEXTOID WAL record does. The impact
* of this is that if a database crash occurs immediately afterward, we
* might after restart re-generate the same OID and find that it conflicts
* with the leftover file or directory. But since for safety's sake we
* always loop until finding a nonconflicting filename, this poses no real
* problem in practice. See pgsql-hackers discussion 27-Sep-2006.
*/
}
/*
* Write an XLOG SWITCH record.
*
* Here we just blindly issue an XLogInsert request for the record.
* All the magic happens inside XLogInsert.
*
* The return value is either the end+1 address of the switch record,
* or the end+1 address of the prior segment if we did not need to
* write a switch record because we are already at segment start.
*/
XLogRecPtr
RequestXLogSwitch(bool mark_unimportant)
{
XLogRecPtr RecPtr;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/* XLOG SWITCH has no data */
XLogBeginInsert();
if (mark_unimportant)
XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
return RecPtr;
}
/*
* Write a RESTORE POINT record
*/
XLogRecPtr
XLogRestorePoint(const char *rpName)
{
XLogRecPtr RecPtr;
xl_restore_point xlrec;
xlrec.rp_time = GetCurrentTimestamp();
strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
ereport(LOG,
(errmsg("restore point \"%s\" created at %X/%X",
rpName, LSN_FORMAT_ARGS(RecPtr))));
return RecPtr;
}
/*
* Check if any of the GUC parameters that are critical for hot standby
* have changed, and update the value in pg_control file if necessary.
*/
static void
XLogReportParameters(void)
{
if (wal_level != ControlFile->wal_level ||
wal_log_hints != ControlFile->wal_log_hints ||
MaxConnections != ControlFile->MaxConnections ||
max_worker_processes != ControlFile->max_worker_processes ||
2019-02-12 02:07:56 +01:00
max_wal_senders != ControlFile->max_wal_senders ||
max_prepared_xacts != ControlFile->max_prepared_xacts ||
max_locks_per_xact != ControlFile->max_locks_per_xact ||
track_commit_timestamp != ControlFile->track_commit_timestamp)
{
/*
* The change in number of backend slots doesn't need to be WAL-logged
* if archiving is not enabled, as you can't start archive recovery
* with wal_level=minimal anyway. We don't really care about the
* values in pg_control either if wal_level=minimal, but seems better
* to keep them up-to-date to avoid confusion.
*/
if (wal_level != ControlFile->wal_level || XLogIsNeeded())
{
xl_parameter_change xlrec;
XLogRecPtr recptr;
xlrec.MaxConnections = MaxConnections;
xlrec.max_worker_processes = max_worker_processes;
2019-02-12 02:07:56 +01:00
xlrec.max_wal_senders = max_wal_senders;
xlrec.max_prepared_xacts = max_prepared_xacts;
xlrec.max_locks_per_xact = max_locks_per_xact;
xlrec.wal_level = wal_level;
xlrec.wal_log_hints = wal_log_hints;
xlrec.track_commit_timestamp = track_commit_timestamp;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(xlrec));
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
XLogFlush(recptr);
}
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->MaxConnections = MaxConnections;
ControlFile->max_worker_processes = max_worker_processes;
2019-02-12 02:07:56 +01:00
ControlFile->max_wal_senders = max_wal_senders;
ControlFile->max_prepared_xacts = max_prepared_xacts;
ControlFile->max_locks_per_xact = max_locks_per_xact;
ControlFile->wal_level = wal_level;
ControlFile->wal_log_hints = wal_log_hints;
ControlFile->track_commit_timestamp = track_commit_timestamp;
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
}
/*
* Update full_page_writes in shared memory, and write an
* XLOG_FPW_CHANGE record if necessary.
*
* Note: this function assumes there is no other process running
* concurrently that could update it.
*/
void
UpdateFullPageWrites(void)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
bool recoveryInProgress;
/*
* Do nothing if full_page_writes has not been changed.
*
* It's safe to check the shared full_page_writes without the lock,
* because we assume that there is no concurrently running process which
* can update it.
*/
if (fullPageWrites == Insert->fullPageWrites)
return;
/*
* Perform this outside critical section so that the WAL insert
* initialization done by RecoveryInProgress() doesn't trigger an
* assertion failure.
*/
recoveryInProgress = RecoveryInProgress();
START_CRIT_SECTION();
/*
* It's always safe to take full page images, even when not strictly
* required, but not the other round. So if we're setting full_page_writes
* to true, first set it true and then write the WAL record. If we're
* setting it to false, first write the WAL record and then set the global
* flag.
*/
if (fullPageWrites)
{
WALInsertLockAcquireExclusive();
Insert->fullPageWrites = true;
WALInsertLockRelease();
}
/*
* Write an XLOG_FPW_CHANGE record. This allows us to keep track of
* full_page_writes during archive recovery, if required.
*/
if (XLogStandbyInfoActive() && !recoveryInProgress)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
}
if (!fullPageWrites)
{
WALInsertLockAcquireExclusive();
Insert->fullPageWrites = false;
WALInsertLockRelease();
}
END_CRIT_SECTION();
}
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
/*
* XLOG resource manager's routines
Start background writer during archive recovery. Background writer now performs its usual buffer cleaning duties during archive recovery, and it's responsible for performing restartpoints. This requires some changes in postmaster. When the startup process has done all the initialization and is ready to start WAL redo, it signals the postmaster to launch the background writer. The postmaster is signaled again when the point in recovery is reached where we know that the database is in consistent state. Postmaster isn't interested in that at the moment, but that's the point where we could let other backends in to perform read-only queries. The postmaster is signaled third time when the recovery has ended, so that postmaster knows that it's safe to start accepting connections. The startup process now traps SIGTERM, and performs a "clean" shutdown. If you do a fast shutdown during recovery, a shutdown restartpoint is performed, like a shutdown checkpoint, and postmaster kills the processes cleanly. You still have to continue the recovery at next startup, though. Currently, the background writer is only launched during archive recovery. We could launch it during crash recovery as well, but it seems better to keep that codepath as simple as possible, for the sake of robustness. And it couldn't do any restartpoints during crash recovery anyway, so it wouldn't be that useful. log_restartpoints is gone. Use log_checkpoints instead. This is yet to be documented. This whole operation is a pre-requisite for Hot Standby, but has some value of its own whether the hot standby patch makes 8.4 or not. Simon Riggs, with lots of modifications by me.
2009-02-18 16:58:41 +01:00
*
* Definitions of info values are in include/catalog/pg_control.h, though
* not all record types are related to control file updates.
*
* NOTE: Some XLOG record types that are directly related to WAL recovery
* are handled in xlogrecovery_redo().
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
*/
2000-10-21 17:43:36 +02:00
void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlog_redo(XLogReaderState *record)
2000-10-21 17:43:36 +02:00
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
XLogRecPtr lsn = record->EndRecPtr;
/*
* In XLOG rmgr, backup blocks are only used by XLOG_FPI and
* XLOG_FPI_FOR_HINT records.
*/
Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
!XLogRecHasAnyBlockRefs(record));
if (info == XLOG_NEXTOID)
{
Oid nextOid;
/*
* We used to try to take the maximum of ShmemVariableCache->nextOid
* and the recorded nextOid, but that fails if the OID counter wraps
* around. Since no OID allocation should be happening during replay
* anyway, better to just believe the record exactly. We still take
* OidGenLock while setting the variable, just in case.
*/
memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
ShmemVariableCache->nextOid = nextOid;
ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
}
else if (info == XLOG_CHECKPOINT_SHUTDOWN)
{
CheckPoint checkPoint;
TimeLineID replayTLI;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
/* In a SHUTDOWN checkpoint, believe the counters exactly */
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
ShmemVariableCache->nextXid = checkPoint.nextXid;
LWLockRelease(XidGenLock);
LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
ShmemVariableCache->nextOid = checkPoint.nextOid;
ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
MultiXactSetNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset);
Rework the way multixact truncations work. The fact that multixact truncations are not WAL logged has caused a fair share of problems. Amongst others it requires to do computations during recovery while the database is not in a consistent state, delaying truncations till checkpoints, and handling members being truncated, but offset not. We tried to put bandaids on lots of these issues over the last years, but it seems time to change course. Thus this patch introduces WAL logging for multixact truncations. This allows: 1) to perform the truncation directly during VACUUM, instead of delaying it to the checkpoint. 2) to avoid looking at the offsets SLRU for truncation during recovery, we can just use the master's values. 3) simplify a fair amount of logic to keep in memory limits straight, this has gotten much easier During the course of fixing this a bunch of additional bugs had to be fixed: 1) Data was not purged from memory the member's SLRU before deleting segments. This happened to be hard or impossible to hit due to the interlock between checkpoints and truncation. 2) find_multixact_start() relied on SimpleLruDoesPhysicalPageExist - but that doesn't work for offsets that haven't yet been flushed to disk. Add code to flush the SLRUs to fix. Not pretty, but it feels slightly safer to only make decisions based on actual on-disk state. 3) find_multixact_start() could be called concurrently with a truncation and thus fail. Via SetOffsetVacuumLimit() that could lead to a round of emergency vacuuming. The problem remains in pg_get_multixact_members(), but that's quite harmless. For now this is going to only get applied to 9.5+, leaving the issues in the older branches in place. It is quite possible that we need to backpatch at a later point though. For the case this gets backpatched we need to handle that an updated standby may be replaying WAL from a not-yet upgraded primary. We have to recognize that situation and use "old style" truncation (i.e. looking at the SLRUs) during WAL replay. In contrast to before, this now happens in the startup process, when replaying a checkpoint record, instead of the checkpointer. Doing truncation in the restartpoint is incorrect, they can happen much later than the original checkpoint, thereby leading to wraparound. To avoid "multixact_redo: unknown op code 48" errors standbys would have to be upgraded before primaries. A later patch will bump the WAL page magic, and remove the legacy truncation codepaths. Legacy truncation support is just included to make a possible future backpatch easier. Discussion: 20150621192409.GA4797@alap3.anarazel.de Reviewed-By: Robert Haas, Alvaro Herrera, Thomas Munro Backpatch: 9.5 for now
2015-09-26 19:04:25 +02:00
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
/*
* No need to set oldestClogXid here as well; it'll be set when we
* redo an xl_clog_truncate if it changed since initialization.
*/
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
2004-08-29 07:07:03 +02:00
/*
* If we see a shutdown checkpoint while waiting for an end-of-backup
* record, the backup was canceled and the end-of-backup record will
* never arrive.
*/
If recovery.conf is created after "pg_ctl stop -m i", do crash recovery. If you create a base backup using an atomic filesystem snapshot, and try to perform PITR starting from that base backup, or if you just kill a master server and create recovery.conf to put it into standby mode, we don't know how far we need to recover before reaching consistency. Normally in crash recovery, we replay all the WAL present in pg_xlog, and assume that we're consistent after that. And normally in archive recovery, minRecoveryPoint, backupEndRequired, or backupEndPoint is set in the control file, indicating how far we need to replay to reach consistency. But if the server was previously up and running normally, and you kill -9 it or take an atomic filesystem snapshot, none of those fields are set in the control file. The solution is to perform crash recovery first, replaying all the WAL in pg_xlog. After that's done, we assume that the system is consistent like in normal crash recovery, and switch to archive recovery mode after that. Per report from Kyotaro HORIGUCHI. In his scenario, recovery.conf was created after "pg_ctl stop -m i". I'm not sure we need to support that exact scenario, but we should support backing up using a filesystem snapshot, which looks identical. This issue goes back to at least 9.0, where hot standby was introduced and we started to track when consistency is reached. In 9.1 and 9.2, we would open up for hot standby too early, and queries could briefly see an inconsistent state. But 9.2 made it more visible, as we started to PANIC if we see a reference to a non-existing page during recovery, if we've already reached consistency. This is a fairly big patch, so back-patch to 9.2 only, where the issue is more visible. We can consider back-patching further after this has received some more testing in 9.2 and master.
2013-02-22 10:43:04 +01:00
if (ArchiveRecoveryRequested &&
!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
ereport(PANIC,
(errmsg("online backup was canceled, recovery cannot continue")));
/*
* If we see a shutdown checkpoint, we know that nothing was running
* on the primary at this point. So fake-up an empty running-xacts
* record and use that here and now. Recover additional standby state
* for prepared transactions.
*/
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
if (standbyState >= STANDBY_INITIALIZED)
{
TransactionId *xids;
int nxids;
TransactionId oldestActiveXID;
TransactionId latestCompletedXid;
RunningTransactionsData running;
oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* Construct a RunningTransactions snapshot representing a shut
* down server, with only prepared transactions still alive. We're
* never overflowed at this point because all subxids are listed
* with their parent prepared transactions.
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
*/
running.xcnt = nxids;
running.subxcnt = 0;
running.subxid_overflow = false;
running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
running.oldestRunningXid = oldestActiveXID;
latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
TransactionIdRetreat(latestCompletedXid);
Assert(TransactionIdIsNormal(latestCompletedXid));
running.latestCompletedXid = latestCompletedXid;
running.xids = xids;
ProcArrayApplyRecoveryInfo(&running);
StandbyRecoverPreparedTransactions();
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
}
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
LWLockRelease(ControlFileLock);
/* Update shared-memory copy of checkpoint XID/epoch */
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->ckptFullXid = checkPoint.nextXid;
SpinLockRelease(&XLogCtl->info_lck);
/*
* We should've already switched to the new TLI before replaying this
* record.
*/
(void) GetCurrentReplayRecPtr(&replayTLI);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
if (checkPoint.ThisTimeLineID != replayTLI)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
checkPoint.ThisTimeLineID, replayTLI)));
xlog.c: Remove global variables ReadRecPtr and EndRecPtr. In most places, the variables necessarily store the same value as the eponymous members of the XLogReaderState that we use during WAL replay, because ReadRecord() assigns the values from the structure members to the global variables just after XLogReadRecord() returns. However, XLogBeginRead() adjusts the structure members but not the global variables, so after XLogBeginRead() and before the completion of XLogReadRecord() the values can differ. Otherwise, they must be identical. According to my analysis, the only place where either variable is referenced at a point where it might not have the same value as the structure member is the refrence to EndRecPtr within XLogPageRead. Therefore, at every other place where we are using the global variable, we can just switch to using the structure member instead, and remove the global variable. However, we can, and in fact should, do this in XLogPageRead() as well, because at that point in the code, the global variable will actually store the start of the record we want to read - either because it's where the last WAL record ended, or because the read position has been changed using XLogBeginRead since the last record was read. The structure member, on the other hand, will already have been updated to point to the end of the record we just read. Elsewhere, the latter is what we use as an argument to emode_for_corrupt_record(), so we should do the same here. This part of the patch is perhaps a bug fix, but I don't think it has any important consequences, so no back-patch. The point here is just to continue to whittle down the entirely excessive use of global variables in xlog.c. Discussion: http://postgr.es/m/CA+Tgmoao96EuNeSPd+hspRKcsCddu=b1h-QNRuKfY8VmfNQdfg@mail.gmail.com
2021-11-24 17:27:39 +01:00
RecoveryRestartPoint(&checkPoint, record);
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
}
else if (info == XLOG_CHECKPOINT_ONLINE)
{
CheckPoint checkPoint;
TimeLineID replayTLI;
XLOG (and related) changes: * Store two past checkpoint locations, not just one, in pg_control. On startup, we fall back to the older checkpoint if the newer one is unreadable. Also, a physical copy of the newest checkpoint record is kept in pg_control for possible use in disaster recovery (ie, complete loss of pg_xlog). Also add a version number for pg_control itself. Remove archdir from pg_control; it ought to be a GUC parameter, not a special case (not that it's implemented yet anyway). * Suppress successive checkpoint records when nothing has been entered in the WAL log since the last one. This is not so much to avoid I/O as to make it actually useful to keep track of the last two checkpoints. If the things are right next to each other then there's not a lot of redundancy gained... * Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs on alternate bytes. Polynomial borrowed from ECMA DLT1 standard. * Fix XLOG record length handling so that it will work at BLCKSZ = 32k. * Change XID allocation to work more like OID allocation. (This is of dubious necessity, but I think it's a good idea anyway.) * Fix a number of minor bugs, such as off-by-one logic for XLOG file wraparound at the 4 gig mark. * Add documentation and clean up some coding infelicities; move file format declarations out to include files where planned contrib utilities can get at them. * Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also possible to force a checkpoint by sending SIGUSR1 to the postmaster (undocumented feature...) * Defend against kill -9 postmaster by storing shmem block's key and ID in postmaster.pid lockfile, and checking at startup to ensure that no processes are still connected to old shmem block (if it still exists). * Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency stop, for symmetry with postmaster and xlog utilities. Clean up signal handling in bootstrap.c so that xlog utilities launched by postmaster will react to signals better. * Standalone bootstrap now grabs lockfile in target directory, as added insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
/* In an ONLINE checkpoint, treat the XID counter as a minimum */
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid,
checkPoint.nextXid))
ShmemVariableCache->nextXid = checkPoint.nextXid;
LWLockRelease(XidGenLock);
/*
* We ignore the nextOid counter in an ONLINE checkpoint, preferring
* to track OID assignment through XLOG_NEXTOID records. The nextOid
* counter is from the start of the checkpoint and might well be stale
* compared to later XLOG_NEXTOID records. We could try to take the
* maximum of the nextOid counter and our latest value, but since
* there's no particular guarantee about the speed with which the OID
* counter wraps around, that's a risky thing to do. In any case,
* users of the nextOid counter are required to avoid assignment of
* duplicates, so that a somewhat out-of-date value should be safe.
*/
/* Handle multixact */
MultiXactAdvanceNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset);
Rework the way multixact truncations work. The fact that multixact truncations are not WAL logged has caused a fair share of problems. Amongst others it requires to do computations during recovery while the database is not in a consistent state, delaying truncations till checkpoints, and handling members being truncated, but offset not. We tried to put bandaids on lots of these issues over the last years, but it seems time to change course. Thus this patch introduces WAL logging for multixact truncations. This allows: 1) to perform the truncation directly during VACUUM, instead of delaying it to the checkpoint. 2) to avoid looking at the offsets SLRU for truncation during recovery, we can just use the master's values. 3) simplify a fair amount of logic to keep in memory limits straight, this has gotten much easier During the course of fixing this a bunch of additional bugs had to be fixed: 1) Data was not purged from memory the member's SLRU before deleting segments. This happened to be hard or impossible to hit due to the interlock between checkpoints and truncation. 2) find_multixact_start() relied on SimpleLruDoesPhysicalPageExist - but that doesn't work for offsets that haven't yet been flushed to disk. Add code to flush the SLRUs to fix. Not pretty, but it feels slightly safer to only make decisions based on actual on-disk state. 3) find_multixact_start() could be called concurrently with a truncation and thus fail. Via SetOffsetVacuumLimit() that could lead to a round of emergency vacuuming. The problem remains in pg_get_multixact_members(), but that's quite harmless. For now this is going to only get applied to 9.5+, leaving the issues in the older branches in place. It is quite possible that we need to backpatch at a later point though. For the case this gets backpatched we need to handle that an updated standby may be replaying WAL from a not-yet upgraded primary. We have to recognize that situation and use "old style" truncation (i.e. looking at the SLRUs) during WAL replay. In contrast to before, this now happens in the startup process, when replaying a checkpoint record, instead of the checkpointer. Doing truncation in the restartpoint is incorrect, they can happen much later than the original checkpoint, thereby leading to wraparound. To avoid "multixact_redo: unknown op code 48" errors standbys would have to be upgraded before primaries. A later patch will bump the WAL page magic, and remove the legacy truncation codepaths. Legacy truncation support is just included to make a possible future backpatch easier. Discussion: 20150621192409.GA4797@alap3.anarazel.de Reviewed-By: Robert Haas, Alvaro Herrera, Thomas Munro Backpatch: 9.5 for now
2015-09-26 19:04:25 +02:00
/*
* NB: This may perform multixact truncation when replaying WAL
* generated by an older primary.
*/
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
checkPoint.oldestXid))
SetTransactionIdLimit(checkPoint.oldestXid,
checkPoint.oldestXidDB);
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
LWLockRelease(ControlFileLock);
/* Update shared-memory copy of checkpoint XID/epoch */
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->ckptFullXid = checkPoint.nextXid;
SpinLockRelease(&XLogCtl->info_lck);
/* TLI should not change in an on-line checkpoint */
(void) GetCurrentReplayRecPtr(&replayTLI);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
if (checkPoint.ThisTimeLineID != replayTLI)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
checkPoint.ThisTimeLineID, replayTLI)));
xlog.c: Remove global variables ReadRecPtr and EndRecPtr. In most places, the variables necessarily store the same value as the eponymous members of the XLogReaderState that we use during WAL replay, because ReadRecord() assigns the values from the structure members to the global variables just after XLogReadRecord() returns. However, XLogBeginRead() adjusts the structure members but not the global variables, so after XLogBeginRead() and before the completion of XLogReadRecord() the values can differ. Otherwise, they must be identical. According to my analysis, the only place where either variable is referenced at a point where it might not have the same value as the structure member is the refrence to EndRecPtr within XLogPageRead. Therefore, at every other place where we are using the global variable, we can just switch to using the structure member instead, and remove the global variable. However, we can, and in fact should, do this in XLogPageRead() as well, because at that point in the code, the global variable will actually store the start of the record we want to read - either because it's where the last WAL record ended, or because the read position has been changed using XLogBeginRead since the last record was read. The structure member, on the other hand, will already have been updated to point to the end of the record we just read. Elsewhere, the latter is what we use as an argument to emode_for_corrupt_record(), so we should do the same here. This part of the patch is perhaps a bug fix, but I don't think it has any important consequences, so no back-patch. The point here is just to continue to whittle down the entirely excessive use of global variables in xlog.c. Discussion: http://postgr.es/m/CA+Tgmoao96EuNeSPd+hspRKcsCddu=b1h-QNRuKfY8VmfNQdfg@mail.gmail.com
2021-11-24 17:27:39 +01:00
RecoveryRestartPoint(&checkPoint, record);
}
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
else if (info == XLOG_OVERWRITE_CONTRECORD)
{
/* nothing to do here, handled in xlogrecovery_redo() */
Fix WAL replay in presence of an incomplete record Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes: LOG: invalid contrecord length 7262 at A8/D9FFFBC8 because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead. A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted. This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch. Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record. A new TAP test that exercises this is added, but the portability of it is yet to be seen. This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: Nathan Bossart <bossartn@amazon.com> Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql
2021-09-29 16:21:51 +02:00
}
else if (info == XLOG_END_OF_RECOVERY)
{
xl_end_of_recovery xlrec;
TimeLineID replayTLI;
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
/*
* For Hot Standby, we could treat this like a Shutdown Checkpoint,
* but this case is rarer and harder to test, so the benefit doesn't
* outweigh the potential extra cost of maintenance.
*/
/*
* We should've already switched to the new TLI before replaying this
* record.
*/
(void) GetCurrentReplayRecPtr(&replayTLI);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
if (xlrec.ThisTimeLineID != replayTLI)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
xlrec.ThisTimeLineID, replayTLI)));
}
else if (info == XLOG_NOOP)
{
/* nothing to do here */
}
else if (info == XLOG_SWITCH)
{
/* nothing to do here */
}
else if (info == XLOG_RESTORE_POINT)
{
/* nothing to do here, handled in xlogrecovery.c */
}
else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
{
/*
* XLOG_FPI records contain nothing else but one or more block
* references. Every block reference must include a full-page image
* even if full_page_writes was disabled when the record was generated
* - otherwise there would be no point in this record.
*
* XLOG_FPI_FOR_HINT records are generated when a page needs to be
* WAL-logged because of a hint bit update. They are only generated
* when checksums and/or wal_log_hints are enabled. They may include
* no full-page images if full_page_writes was disabled when they were
* generated. In this case there is nothing to do here.
*
* No recovery conflicts are generated by these generic records - if a
* resource manager needs to generate conflicts, it has to define a
* separate WAL record type and redo routine.
*/
for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
{
Buffer buffer;
if (!XLogRecHasBlockImage(record, block_id))
{
if (info == XLOG_FPI)
elog(ERROR, "XLOG_FPI record did not contain a full-page image");
continue;
}
if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
UnlockReleaseBuffer(buffer);
}
}
else if (info == XLOG_BACKUP_END)
{
/* nothing to do here, handled in xlogrecovery_redo() */
}
else if (info == XLOG_PARAMETER_CHANGE)
{
xl_parameter_change xlrec;
/* Update our copy of the parameters in pg_control */
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->MaxConnections = xlrec.MaxConnections;
ControlFile->max_worker_processes = xlrec.max_worker_processes;
2019-02-12 02:07:56 +01:00
ControlFile->max_wal_senders = xlrec.max_wal_senders;
ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
ControlFile->wal_level = xlrec.wal_level;
ControlFile->wal_log_hints = xlrec.wal_log_hints;
2010-07-06 21:19:02 +02:00
/*
* Update minRecoveryPoint to ensure that if recovery is aborted, we
* recover back up to this point before allowing hot standby again.
* This is important if the max_* settings are decreased, to ensure
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
* you don't run queries against the WAL preceding the change. The
* local copies cannot be updated as long as crash recovery is
* happening and we expect all the WAL to be replayed.
*/
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
if (InArchiveRecovery)
{
LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
Prevent references to invalid relation pages after fresh promotion If a standby crashes after promotion before having completed its first post-recovery checkpoint, then the minimal recovery point which marks the LSN position where the cluster is able to reach consistency may be set to a position older than the first end-of-recovery checkpoint while all the WAL available should be replayed. This leads to the instance thinking that it contains inconsistent pages, causing a PANIC and a hard instance crash even if all the WAL available has not been replayed for certain sets of records replayed. When in crash recovery, minRecoveryPoint is expected to always be set to InvalidXLogRecPtr, which forces the recovery to replay all the WAL available, so this commit makes sure that the local copy of minRecoveryPoint from the control file is initialized properly and stays as it is while crash recovery is performed. Once switching to archive recovery or if crash recovery finishes, then the local copy minRecoveryPoint can be safely updated. Pavan Deolasee has reported and diagnosed the failure in the first place, and the base fix idea to rely on the local copy of minRecoveryPoint comes from Kyotaro Horiguchi, which has been expanded into a full-fledged patch by me. The test included in this commit has been written by Álvaro Herrera and Pavan Deolasee, which I have modified to make it faster and more reliable with sleep phases. Backpatch down to all supported versions where the bug appears, aka 9.3 which is where the end-of-recovery checkpoint is not run by the startup process anymore. The test gets easily supported down to 10, still it has been tested on all branches. Reported-by: Pavan Deolasee Diagnosed-by: Pavan Deolasee Reviewed-by: Pavan Deolasee, Kyotaro Horiguchi Author: Michael Paquier, Kyotaro Horiguchi, Pavan Deolasee, Álvaro Herrera Discussion: https://postgr.es/m/CABOikdPOewjNL=05K5CbNMxnNtXnQjhTx2F--4p4ruorCjukbA@mail.gmail.com
2018-07-05 03:46:18 +02:00
}
if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
{
TimeLineID replayTLI;
(void) GetCurrentReplayRecPtr(&replayTLI);
ControlFile->minRecoveryPoint = lsn;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
ControlFile->minRecoveryPointTLI = replayTLI;
}
CommitTsParameterChange(xlrec.track_commit_timestamp,
ControlFile->track_commit_timestamp);
ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
UpdateControlFile();
LWLockRelease(ControlFileLock);
2019-02-12 02:07:56 +01:00
/* Check to see if any parameter change gives a problem on recovery */
CheckRequiredParameterValues();
}
else if (info == XLOG_FPW_CHANGE)
{
bool fpw;
memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
/*
* Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
* do_pg_start_backup() and do_pg_stop_backup() can check whether
* full_page_writes has been disabled during online backup.
*/
if (!fpw)
{
SpinLockAcquire(&XLogCtl->info_lck);
xlog.c: Remove global variables ReadRecPtr and EndRecPtr. In most places, the variables necessarily store the same value as the eponymous members of the XLogReaderState that we use during WAL replay, because ReadRecord() assigns the values from the structure members to the global variables just after XLogReadRecord() returns. However, XLogBeginRead() adjusts the structure members but not the global variables, so after XLogBeginRead() and before the completion of XLogReadRecord() the values can differ. Otherwise, they must be identical. According to my analysis, the only place where either variable is referenced at a point where it might not have the same value as the structure member is the refrence to EndRecPtr within XLogPageRead. Therefore, at every other place where we are using the global variable, we can just switch to using the structure member instead, and remove the global variable. However, we can, and in fact should, do this in XLogPageRead() as well, because at that point in the code, the global variable will actually store the start of the record we want to read - either because it's where the last WAL record ended, or because the read position has been changed using XLogBeginRead since the last record was read. The structure member, on the other hand, will already have been updated to point to the end of the record we just read. Elsewhere, the latter is what we use as an argument to emode_for_corrupt_record(), so we should do the same here. This part of the patch is perhaps a bug fix, but I don't think it has any important consequences, so no back-patch. The point here is just to continue to whittle down the entirely excessive use of global variables in xlog.c. Discussion: http://postgr.es/m/CA+Tgmoao96EuNeSPd+hspRKcsCddu=b1h-QNRuKfY8VmfNQdfg@mail.gmail.com
2021-11-24 17:27:39 +01:00
if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
SpinLockRelease(&XLogCtl->info_lck);
}
/* Keep track of full_page_writes */
lastFullPageWrites = fpw;
}
2000-10-21 17:43:36 +02:00
}
2001-03-22 05:01:46 +01:00
/*
* Return the (possible) sync flag used for opening a file, depending on the
* value of the GUC wal_sync_method.
*/
static int
get_sync_bit(int method)
{
int o_direct_flag = 0;
/* If fsync is disabled, never open in sync mode */
if (!enableFsync)
return 0;
/*
* Optimize writes by bypassing kernel cache with O_DIRECT when using
* O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
* disabled, otherwise the archive command or walsender process will read
* the WAL soon after writing it, which is guaranteed to cause a physical
* read if we bypassed the kernel cache. We also skip the
* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
* reason.
*
* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
* written by walreceiver is normally read by the startup process soon
* after it's written. Also, walreceiver performs unaligned writes, which
* don't work with O_DIRECT, so it is required for correctness too.
*/
Fix management of pendingOpsTable in auxiliary processes. mdinit() was misusing IsBootstrapProcessingMode() to decide whether to create an fsync pending-operations table in the current process. This led to creating a table not only in the startup and checkpointer processes as intended, but also in the bgwriter process, not to mention other auxiliary processes such as walwriter and walreceiver. Creation of the table in the bgwriter is fatal, because it absorbs fsync requests that should have gone to the checkpointer; instead they just sit in bgwriter local memory and are never acted on. So writes performed by the bgwriter were not being fsync'd which could result in data loss after an OS crash. I think there is no live bug with respect to walwriter and walreceiver because those never perform any writes of shared buffers; but the potential is there for future breakage in those processes too. To fix, make AuxiliaryProcessMain() export the current process's AuxProcType as a global variable, and then make mdinit() test directly for the types of aux process that should have a pendingOpsTable. Having done that, we might as well also get rid of the random bool flags such as am_walreceiver that some of the aux processes had grown. (Note that we could not have fixed the bug by examining those variables in mdinit(), because it's called from BaseInit() which is run by AuxiliaryProcessMain() before entering any of the process-type-specific code.) Back-patch to 9.2, where the problem was introduced by the split-up of bgwriter and checkpointer processes. The bogus pendingOpsTable exists in walwriter and walreceiver processes in earlier branches, but absent any evidence that it causes actual problems there, I'll leave the older branches alone.
2012-07-18 21:28:10 +02:00
if (!XLogIsNeeded() && !AmWalReceiverProcess())
o_direct_flag = PG_O_DIRECT;
switch (method)
{
2008-05-12 10:35:05 +02:00
/*
* enum values for all sync options are defined even if they are
* not supported on the current platform. But if not, they are
* not included in the enum option array, and therefore will never
* be seen here.
2008-05-12 10:35:05 +02:00
*/
case SYNC_METHOD_FSYNC:
case SYNC_METHOD_FSYNC_WRITETHROUGH:
case SYNC_METHOD_FDATASYNC:
return 0;
#ifdef OPEN_SYNC_FLAG
2008-05-12 10:35:05 +02:00
case SYNC_METHOD_OPEN:
return OPEN_SYNC_FLAG | o_direct_flag;
#endif
#ifdef OPEN_DATASYNC_FLAG
2008-05-12 10:35:05 +02:00
case SYNC_METHOD_OPEN_DSYNC:
return OPEN_DATASYNC_FLAG | o_direct_flag;
#endif
2008-05-12 10:35:05 +02:00
default:
/* can't happen (unless we are out of sync with option array) */
elog(ERROR, "unrecognized wal_sync_method: %d", method);
return 0; /* silence warning */
2008-05-12 10:35:05 +02:00
}
}
/*
* GUC support
*/
void
assign_xlog_sync_method(int new_sync_method, void *extra)
{
if (sync_method != new_sync_method)
{
/*
* To ensure that no blocks escape unsynced, force an fsync on the
* currently open log segment (if any). Also, if the open flag is
* changing, close the log file so it will be reopened (with new flag
* bit) at next use.
*/
if (openLogFile >= 0)
{
pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
if (pg_fsync(openLogFile) != 0)
{
char xlogfname[MAXFNAMELEN];
int save_errno;
save_errno = errno;
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
XLogFileName(xlogfname, openLogTLI, openLogSegNo,
wal_segment_size);
errno = save_errno;
ereport(PANIC,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", xlogfname)));
}
pgstat_report_wait_end();
if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
XLogFileClose();
}
}
}
/*
* Issue appropriate kind of fsync (if any) for an XLOG output file.
*
* 'fd' is a file descriptor for the XLOG file to be fsync'd.
* 'segno' is for error reporting purposes.
*/
void
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
{
char *msg = NULL;
instr_time start;
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
Assert(tli != 0);
/*
* Quick exit if fsync is disabled or write() has already synced the WAL
* file.
*/
if (!enableFsync ||
sync_method == SYNC_METHOD_OPEN ||
sync_method == SYNC_METHOD_OPEN_DSYNC)
return;
/* Measure I/O timing to sync the WAL file */
if (track_wal_io_timing)
INSTR_TIME_SET_CURRENT(start);
pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
switch (sync_method)
{
case SYNC_METHOD_FSYNC:
if (pg_fsync_no_writethrough(fd) != 0)
msg = _("could not fsync file \"%s\": %m");
break;
#ifdef HAVE_FSYNC_WRITETHROUGH
case SYNC_METHOD_FSYNC_WRITETHROUGH:
if (pg_fsync_writethrough(fd) != 0)
msg = _("could not fsync write-through file \"%s\": %m");
break;
#endif
#ifdef HAVE_FDATASYNC
case SYNC_METHOD_FDATASYNC:
if (pg_fdatasync(fd) != 0)
msg = _("could not fdatasync file \"%s\": %m");
break;
#endif
case SYNC_METHOD_OPEN:
case SYNC_METHOD_OPEN_DSYNC:
/* not reachable */
Assert(false);
break;
default:
elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
break;
}
/* PANIC if failed to fsync */
if (msg)
{
char xlogfname[MAXFNAMELEN];
int save_errno = errno;
Remove all use of ThisTimeLineID global variable outside of xlog.c All such code deals with this global variable in one of three ways. Sometimes the same functions use it in more than one of these ways at the same time. First, sometimes it's an implicit argument to one or more functions being called in xlog.c or elsewhere, and must be set to the appropriate value before calling those functions lest they misbehave. In those cases, it is now passed as an explicit argument instead. Second, sometimes it's used to obtain the current timeline after the end of recovery, i.e. the timeline to which WAL is being written and flushed. Such code now calls GetWALInsertionTimeLine() or relies on the new out parameter added to GetFlushRecPtr(). Third, sometimes it's used during recovery to store the current replay timeline. That can change, so such code must generally update the value before each use. It can still do that, but must now use a local variable instead. The net effect of these changes is to reduce by a fair amount the amount of code that is directly accessing this global variable. That's good, because history has shown that we don't always think clearly about which timeline ID it's supposed to contain at any given point in time, or indeed, whether it has been or needs to be initialized at any given point in the code. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:50:01 +01:00
XLogFileName(xlogfname, tli, segno, wal_segment_size);
errno = save_errno;
ereport(PANIC,
(errcode_for_file_access(),
errmsg(msg, xlogfname)));
}
pgstat_report_wait_end();
/*
* Increment the I/O timing and the number of times WAL files were synced.
*/
if (track_wal_io_timing)
{
instr_time duration;
INSTR_TIME_SET_CURRENT(duration);
INSTR_TIME_SUBTRACT(duration, start);
WalStats.m_wal_sync_time += INSTR_TIME_GET_MICROSEC(duration);
}
WalStats.m_wal_sync++;
}
/*
* do_pg_start_backup
*
* Utility function called at the start of an online backup. It creates the
* necessary starting checkpoint and constructs the backup label file.
*
* There are two kind of backups: exclusive and non-exclusive. An exclusive
* backup is started with pg_start_backup(), and there can be only one active
* at a time. The backup and tablespace map files of an exclusive backup are
* written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
* removed by pg_stop_backup().
*
* A non-exclusive backup is used for the streaming base backups (see
* src/backend/replication/basebackup.c). The difference to exclusive backups
* is that the backup label and tablespace map files are not written to disk.
* Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
* and the caller is responsible for including them in the backup archive as
* 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
* active at the same time, and they don't conflict with an exclusive backup
* either.
*
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
* labelfile and tblspcmapfile must be passed as NULL when starting an
* exclusive backup, and as initially-empty StringInfos for a non-exclusive
* backup.
*
* If "tablespaces" isn't NULL, it receives a list of tablespaceinfo structs
* describing the cluster's tablespaces.
*
* tblspcmapfile is required mainly for tar format in windows as native windows
* utilities are not able to create symlinks while extracting files from tar.
* However for consistency, the same is used for all platforms.
*
* Returns the minimum WAL location that must be present to restore from this
* backup, and the corresponding timeline ID in *starttli_p.
*
* Every successfully started non-exclusive backup must be stopped by calling
* do_pg_stop_backup() or do_pg_abort_backup().
*
* It is the responsibility of the caller of this function to verify the
* permissions of the calling user!
*/
XLogRecPtr
do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
StringInfo labelfile, List **tablespaces,
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
StringInfo tblspcmapfile)
{
bool exclusive = (labelfile == NULL);
bool backup_started_in_recovery = false;
XLogRecPtr checkpointloc;
XLogRecPtr startpoint;
TimeLineID starttli;
pg_time_t stamp_time;
char strfbuf[128];
char xlogfilename[MAXFNAMELEN];
XLogSegNo _logSegNo;
struct stat stat_buf;
FILE *fp;
backup_started_in_recovery = RecoveryInProgress();
/*
* Currently only non-exclusive backup can be taken during recovery.
*/
if (backup_started_in_recovery && exclusive)
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
/*
* During recovery, we don't need to check WAL level. Because, if WAL
* level is not sufficient, it's impossible to get here during recovery.
*/
if (!backup_started_in_recovery && !XLogIsNeeded())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
if (strlen(backupidstr) > MAXPGPATH)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("backup label too long (max %d bytes)",
MAXPGPATH)));
/*
* Mark backup active in shared memory. We must do full-page WAL writes
* during an on-line backup even if not doing so at other times, because
* it's quite possible for the backup dump to obtain a "torn" (partially
* written) copy of a database page if it reads the page concurrently with
* our write to the same page. This can be fixed as long as the first
* write to the page in the WAL sequence is a full-page write. Hence, we
* turn on forcePageWrites and then force a CHECKPOINT, to ensure there
* are no dirty pages in shared memory that might get dumped while the
* backup is in progress without having a corresponding WAL record. (Once
* the backup is complete, we need not force full-page writes anymore,
* since we expect that any pages not modified during the backup interval
* must have been correctly captured by the backup.)
*
* Note that forcePageWrites has no effect during an online backup from
* the standby.
*
* We must hold all the insertion locks to change the value of
* forcePageWrites, to ensure adequate interlocking against
* XLogInsertRecord().
*/
WALInsertLockAcquireExclusive();
if (exclusive)
{
/*
* At first, mark that we're now starting an exclusive backup, to
* ensure that there are no other sessions currently running
* pg_start_backup() or pg_stop_backup().
*/
if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
{
WALInsertLockRelease();
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress"),
errhint("Run pg_stop_backup() and try again.")));
}
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
}
else
XLogCtl->Insert.nonExclusiveBackups++;
XLogCtl->Insert.forcePageWrites = true;
WALInsertLockRelease();
2004-08-29 07:07:03 +02:00
/* Ensure we release forcePageWrites if fail below */
PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
{
bool gotUniqueStartpoint = false;
DIR *tblspcdir;
struct dirent *de;
tablespaceinfo *ti;
int datadirpathlen;
2011-04-10 17:42:00 +02:00
/*
* Force an XLOG file switch before the checkpoint, to ensure that the
* WAL segment the checkpoint is written to doesn't contain pages with
* old timeline IDs. That would otherwise happen if you called
* pg_start_backup() right after restoring from a PITR archive: the
* first WAL segment containing the startup checkpoint has pages in
* the beginning with the old timeline ID. That can cause trouble at
* recovery: we won't have a history file covering the old timeline if
* pg_wal directory was not included in the base backup and the WAL
* archive was cleared too before starting the backup.
*
* This also ensures that we have emitted a WAL page header that has
* XLP_BKP_REMOVABLE off before we emit the checkpoint record.
* Therefore, if a WAL archiver (such as pglesslog) is trying to
* compress out removable backup blocks, it won't remove any that
* occur after this point.
*
* During recovery, we skip forcing XLOG file switch, which means that
* the backup taken during recovery is not available for the special
* recovery case described above.
*/
if (!backup_started_in_recovery)
RequestXLogSwitch(false);
do
{
bool checkpointfpw;
/*
* Force a CHECKPOINT. Aside from being necessary to prevent torn
* page problems, this guarantees that two successive backup runs
* will have different checkpoint positions and hence different
* history file names, even if nothing happened in between.
*
* During recovery, establish a restartpoint if possible. We use
* the last restartpoint as the backup starting checkpoint. This
* means that two successive backup runs can have same checkpoint
* positions.
*
* Since the fact that we are executing do_pg_start_backup()
* during recovery means that checkpointer is running, we can use
* RequestCheckpoint() to establish a restartpoint.
*
* We use CHECKPOINT_IMMEDIATE only if requested by user (via
* passing fast = true). Otherwise this can take awhile.
*/
RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
(fast ? CHECKPOINT_IMMEDIATE : 0));
/*
* Now we need to fetch the checkpoint record location, and also
* its REDO pointer. The oldest point in WAL that would be needed
* to restore starting from the checkpoint is precisely the REDO
* pointer.
*/
LWLockAcquire(ControlFileLock, LW_SHARED);
checkpointloc = ControlFile->checkPoint;
startpoint = ControlFile->checkPointCopy.redo;
starttli = ControlFile->checkPointCopy.ThisTimeLineID;
checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
LWLockRelease(ControlFileLock);
if (backup_started_in_recovery)
{
XLogRecPtr recptr;
/*
* Check to see if all WAL replayed during online backup
* (i.e., since last restartpoint used as backup starting
* checkpoint) contain full-page writes.
*/
SpinLockAcquire(&XLogCtl->info_lck);
recptr = XLogCtl->lastFpwDisableRecPtr;
SpinLockRelease(&XLogCtl->info_lck);
if (!checkpointfpw || startpoint <= recptr)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL generated with full_page_writes=off was replayed "
"since last restartpoint"),
errhint("This means that the backup being taken on the standby "
"is corrupt and should not be used. "
"Enable full_page_writes and run CHECKPOINT on the primary, "
"and then try an online backup again.")));
/*
* During recovery, since we don't use the end-of-backup WAL
* record and don't write the backup history file, the
* starting WAL location doesn't need to be unique. This means
* that two base backups started at the same time might use
* the same checkpoint as starting locations.
*/
gotUniqueStartpoint = true;
}
/*
* If two base backups are started at the same time (in WAL sender
* processes), we need to make sure that they use different
* checkpoints as starting locations, because we use the starting
* WAL location as a unique identifier for the base backup in the
* end-of-backup WAL record and when we write the backup history
* file. Perhaps it would be better generate a separate unique ID
* for each backup instead of forcing another checkpoint, but
* taking a checkpoint right after another is not that expensive
* either because only few buffers have been dirtied yet.
*/
WALInsertLockAcquireExclusive();
if (XLogCtl->Insert.lastBackupStart < startpoint)
{
XLogCtl->Insert.lastBackupStart = startpoint;
gotUniqueStartpoint = true;
}
WALInsertLockRelease();
} while (!gotUniqueStartpoint);
2004-08-29 07:07:03 +02:00
XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
2004-08-29 07:07:03 +02:00
/*
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
* Construct tablespace_map file. If caller isn't interested in this,
* we make a local StringInfo.
*/
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
if (tblspcmapfile == NULL)
tblspcmapfile = makeStringInfo();
datadirpathlen = strlen(DataDir);
/* Collect information about all tablespaces */
tblspcdir = AllocateDir("pg_tblspc");
while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
{
char fullpath[MAXPGPATH + 10];
char linkpath[MAXPGPATH];
char *relpath = NULL;
int rllen;
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
StringInfoData escapedpath;
char *s;
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
/* Skip anything that doesn't look like a tablespace */
if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
continue;
snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
/*
* Skip anything that isn't a symlink/junction. For testing only,
* we sometimes use allow_in_place_tablespaces to create
* directories directly under pg_tblspc, which would fail below.
*/
#ifdef WIN32
if (!pgwin32_is_junction(fullpath))
continue;
#else
if (get_dirent_type(fullpath, de, false, ERROR) != PGFILETYPE_LNK)
continue;
#endif
#if defined(HAVE_READLINK) || defined(WIN32)
rllen = readlink(fullpath, linkpath, sizeof(linkpath));
if (rllen < 0)
{
ereport(WARNING,
(errmsg("could not read symbolic link \"%s\": %m",
fullpath)));
continue;
}
else if (rllen >= sizeof(linkpath))
{
ereport(WARNING,
(errmsg("symbolic link \"%s\" target is too long",
fullpath)));
continue;
}
linkpath[rllen] = '\0';
/*
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
* Build a backslash-escaped version of the link path to include
* in the tablespace map file.
*/
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
initStringInfo(&escapedpath);
for (s = linkpath; *s; s++)
{
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
if (*s == '\n' || *s == '\r' || *s == '\\')
appendStringInfoChar(&escapedpath, '\\');
appendStringInfoChar(&escapedpath, *s);
}
/*
* Relpath holds the relative path of the tablespace directory
* when it's located within PGDATA, or NULL if it's located
* elsewhere.
*/
if (rllen > datadirpathlen &&
strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
IS_DIR_SEP(linkpath[datadirpathlen]))
relpath = linkpath + datadirpathlen + 1;
ti = palloc(sizeof(tablespaceinfo));
ti->oid = pstrdup(de->d_name);
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
ti->path = pstrdup(linkpath);
ti->rpath = relpath ? pstrdup(relpath) : NULL;
ti->size = -1;
if (tablespaces)
*tablespaces = lappend(*tablespaces, ti);
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
appendStringInfo(tblspcmapfile, "%s %s\n",
ti->oid, escapedpath.data);
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
pfree(escapedpath.data);
#else
/*
* If the platform does not have symbolic links, it should not be
* possible to have tablespaces - clearly somebody else created
* them. Warn about it and ignore.
*/
ereport(WARNING,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("tablespaces are not supported on this platform")));
#endif
}
FreeDir(tblspcdir);
/*
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
* Construct backup label file. If caller isn't interested in this,
* we make a local StringInfo.
*/
Code review for server's handling of "tablespace map" files. While looking at Robert Foggia's report, I noticed a passel of other issues in the same area: * The scheme for backslash-quoting newlines in pathnames is just wrong; it will misbehave if the last ordinary character in a pathname is a backslash. I'm not sure why we're bothering to allow newlines in tablespace paths, but if we're going to do it we should do it without introducing other problems. Hence, backslashes themselves have to be backslashed too. * The author hadn't read the sscanf man page very carefully, because this code would drop any leading whitespace from the path. (I doubt that a tablespace path with leading whitespace could happen in practice; but if we're bothering to allow newlines in the path, it sure seems like leading whitespace is little less implausible.) Using sscanf for the task of finding the first space is overkill anyway. * While I'm not 100% sure what the rationale for escaping both \r and \n is, if the idea is to allow Windows newlines in the file then this code failed, because it'd throw an error if it saw \r followed by \n. * There's no cross-check for an incomplete final line in the map file, which would be a likely apparent symptom of the improper-escaping bug. On the generation end, aside from the escaping issue we have: * If needtblspcmapfile is true then do_pg_start_backup will pass back escaped strings in tablespaceinfo->path values, which no caller wants or is prepared to deal with. I'm not sure if there's a live bug from that, but it looks like there might be (given the dubious assumption that anyone actually has newlines in their tablespace paths). * It's not being very paranoid about the possibility of random stuff in the pg_tblspc directory. IMO we should ignore anything without an OID-like name. The escaping rule change doesn't seem back-patchable: it'll require doubling of backslashes in the tablespace_map file, which is basically a basebackup format change. The odds of that causing trouble are considerably more than the odds of the existing bug causing trouble. The rest of this seems somewhat unlikely to cause problems too, so no back-patch.
2021-03-17 21:18:46 +01:00
if (labelfile == NULL)
labelfile = makeStringInfo();
/* Use the log timezone here, not the session timezone */
stamp_time = (pg_time_t) time(NULL);
pg_strftime(strfbuf, sizeof(strfbuf),
"%Y-%m-%d %H:%M:%S %Z",
pg_localtime(&stamp_time, log_timezone));
appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
LSN_FORMAT_ARGS(startpoint), xlogfilename);
appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
LSN_FORMAT_ARGS(checkpointloc));
appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
exclusive ? "pg_start_backup" : "streamed");
appendStringInfo(labelfile, "BACKUP FROM: %s\n",
backup_started_in_recovery ? "standby" : "primary");
appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
/*
* Okay, write the file, or return its contents to caller.
*/
if (exclusive)
{
/*
* Check for existing backup label --- implies a backup is already
* running. (XXX given that we checked exclusiveBackupState
* above, maybe it would be OK to just unlink any such label
* file?)
*/
if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
{
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m",
BACKUP_LABEL_FILE)));
}
else
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress"),
errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
BACKUP_LABEL_FILE)));
fp = AllocateFile(BACKUP_LABEL_FILE, "w");
if (!fp)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m",
BACKUP_LABEL_FILE)));
if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
fflush(fp) != 0 ||
pg_fsync(fileno(fp)) != 0 ||
ferror(fp) ||
FreeFile(fp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
BACKUP_LABEL_FILE)));
/* Allocated locally for exclusive backups, so free separately */
pfree(labelfile->data);
pfree(labelfile);
/* Write backup tablespace_map file. */
if (tblspcmapfile->len > 0)
{
if (stat(TABLESPACE_MAP, &stat_buf) != 0)
{
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m",
TABLESPACE_MAP)));
}
else
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress"),
errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
TABLESPACE_MAP)));
fp = AllocateFile(TABLESPACE_MAP, "w");
if (!fp)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m",
TABLESPACE_MAP)));
if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
fflush(fp) != 0 ||
pg_fsync(fileno(fp)) != 0 ||
ferror(fp) ||
FreeFile(fp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
TABLESPACE_MAP)));
}
/* Allocated locally for exclusive backups, so free separately */
pfree(tblspcmapfile->data);
pfree(tblspcmapfile);
}
}
PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
2004-08-29 07:07:03 +02:00
/*
* Mark that start phase has correctly finished for an exclusive backup.
* Session-level locks are updated as well to reflect that state.
*
* Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
* counters and session-level lock. Otherwise they can be updated
* inconsistently, and which might cause do_pg_abort_backup() to fail.
*/
if (exclusive)
{
WALInsertLockAcquireExclusive();
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
/* Set session-level lock */
sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
WALInsertLockRelease();
}
else
sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
/*
* We're done. As a convenience, return the starting WAL location.
*/
if (starttli_p)
*starttli_p = starttli;
return startpoint;
}
/* Error cleanup callback for pg_start_backup */
static void
pg_start_backup_callback(int code, Datum arg)
{
bool exclusive = DatumGetBool(arg);
/* Update backup counters and forcePageWrites on failure */
WALInsertLockAcquireExclusive();
if (exclusive)
{
Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
}
else
{
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
XLogCtl->Insert.nonExclusiveBackups--;
}
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
XLogCtl->Insert.nonExclusiveBackups == 0)
{
XLogCtl->Insert.forcePageWrites = false;
}
WALInsertLockRelease();
}
/*
* Error cleanup callback for pg_stop_backup
*/
static void
pg_stop_backup_callback(int code, Datum arg)
{
bool exclusive = DatumGetBool(arg);
/* Update backup status on failure */
WALInsertLockAcquireExclusive();
if (exclusive)
{
Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
}
WALInsertLockRelease();
}
/*
* Utility routine to fetch the session-level status of a backup running.
*/
SessionBackupState
get_backup_status(void)
{
return sessionBackupState;
}
/*
* do_pg_stop_backup
*
* Utility function called at the end of an online backup. It cleans up the
* backup state and can optionally wait for WAL segments to be archived.
2016-10-21 18:04:21 +02:00
*
* If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
* the non-exclusive backup specified by 'labelfile'.
*
* Returns the last WAL location that must be present to restore from this
* backup, and the corresponding timeline ID in *stoptli_p.
*
* It is the responsibility of the caller of this function to verify the
* permissions of the calling user!
*/
XLogRecPtr
do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
{
bool exclusive = (labelfile == NULL);
bool backup_started_in_recovery = false;
XLogRecPtr startpoint;
XLogRecPtr stoppoint;
TimeLineID stoptli;
pg_time_t stamp_time;
char strfbuf[128];
char histfilepath[MAXPGPATH];
char startxlogfilename[MAXFNAMELEN];
char stopxlogfilename[MAXFNAMELEN];
char lastxlogfilename[MAXFNAMELEN];
char histfilename[MAXFNAMELEN];
char backupfrom[20];
XLogSegNo _logSegNo;
FILE *lfp;
FILE *fp;
char ch;
int seconds_before_warning;
int waits = 0;
bool reported_waiting = false;
char *remaining;
char *ptr;
uint32 hi,
lo;
backup_started_in_recovery = RecoveryInProgress();
/*
* Currently only non-exclusive backup can be taken during recovery.
*/
if (backup_started_in_recovery && exclusive)
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
/*
* During recovery, we don't need to check WAL level. Because, if WAL
* level is not sufficient, it's impossible to get here during recovery.
*/
if (!backup_started_in_recovery && !XLogIsNeeded())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
if (exclusive)
{
/*
* At first, mark that we're now stopping an exclusive backup, to
* ensure that there are no other sessions currently running
* pg_start_backup() or pg_stop_backup().
*/
WALInsertLockAcquireExclusive();
if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
{
WALInsertLockRelease();
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("exclusive backup not in progress")));
}
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
WALInsertLockRelease();
/*
* Remove backup_label. In case of failure, the state for an exclusive
* backup is switched back to in-progress.
*/
PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
{
/*
* Read the existing label file into memory.
*/
struct stat statbuf;
int r;
if (stat(BACKUP_LABEL_FILE, &statbuf))
{
/* should not happen per the upper checks */
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m",
BACKUP_LABEL_FILE)));
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is not in progress")));
}
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
if (!lfp)
{
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
BACKUP_LABEL_FILE)));
}
labelfile = palloc(statbuf.st_size + 1);
r = fread(labelfile, statbuf.st_size, 1, lfp);
labelfile[statbuf.st_size] = '\0';
/*
* Close and remove the backup label file
*/
if (r != 1 || ferror(lfp) || FreeFile(lfp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
BACKUP_LABEL_FILE)));
durable_unlink(BACKUP_LABEL_FILE, ERROR);
/*
* Remove tablespace_map file if present, it is created only if
* there are tablespaces.
*/
durable_unlink(TABLESPACE_MAP, DEBUG1);
}
PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
}
/*
* OK to update backup counters, forcePageWrites and session-level lock.
*
* Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
* Otherwise they can be updated inconsistently, and which might cause
* do_pg_abort_backup() to fail.
*/
WALInsertLockAcquireExclusive();
if (exclusive)
{
XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
}
else
{
/*
* The user-visible pg_start/stop_backup() functions that operate on
* exclusive backups can be called at any time, but for non-exclusive
* backups, it is expected that each do_pg_start_backup() call is
* matched by exactly one do_pg_stop_backup() call.
*/
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
XLogCtl->Insert.nonExclusiveBackups--;
}
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
XLogCtl->Insert.nonExclusiveBackups == 0)
{
XLogCtl->Insert.forcePageWrites = false;
}
/*
* Clean up session-level lock.
*
* You might think that WALInsertLockRelease() can be called before
* cleaning up session-level lock because session-level lock doesn't need
* to be protected with WAL insertion lock. But since
* CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
* cleaned up before it.
*/
sessionBackupState = SESSION_BACKUP_NONE;
WALInsertLockRelease();
/*
* Read and parse the START WAL LOCATION line (this code is pretty crude,
* but we are not expecting any variability in the file format).
*/
if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
&hi, &lo, startxlogfilename,
&ch) != 4 || ch != '\n')
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
startpoint = ((uint64) hi) << 32 | lo;
remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
2004-08-29 07:07:03 +02:00
/*
* Parse the BACKUP FROM line. If we are taking an online backup from the
* standby, we confirm that the standby has not been promoted during the
* backup.
*/
ptr = strstr(remaining, "BACKUP FROM:");
if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("the standby was promoted during online backup"),
errhint("This means that the backup being taken is corrupt "
"and should not be used. "
"Try taking another online backup.")));
/*
* During recovery, we don't write an end-of-backup record. We assume that
* pg_control was backed up last and its minimum recovery point can be
* available as the backup end location. Since we don't have an
* end-of-backup record, we use the pg_control value to check whether
* we've reached the end of backup when starting recovery from this
* backup. We have no way of checking if pg_control wasn't backed up last
* however.
*
* We don't force a switch to new WAL file but it is still possible to
* wait for all the required files to be archived if waitforarchive is
* true. This is okay if we use the backup to start a standby and fetch
* the missing WAL using streaming replication. But in the case of an
* archive recovery, a user should set waitforarchive to true and wait for
* them to be archived to ensure that all the required files are
* available.
*
* We return the current minimum recovery point as the backup end
* location. Note that it can be greater than the exact backup end
* location if the minimum recovery point is updated after the backup of
* pg_control. This is harmless for current uses.
*
* XXX currently a backup history file is for informational and debug
* purposes only. It's not essential for an online backup. Furthermore,
* even if it's created, it will not be archived during recovery because
* an archiver is not invoked. So it doesn't seem worthwhile to write a
* backup history file during recovery.
*/
if (backup_started_in_recovery)
{
XLogRecPtr recptr;
/*
* Check to see if all WAL replayed during online backup contain
* full-page writes.
*/
SpinLockAcquire(&XLogCtl->info_lck);
recptr = XLogCtl->lastFpwDisableRecPtr;
SpinLockRelease(&XLogCtl->info_lck);
if (startpoint <= recptr)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL generated with full_page_writes=off was replayed "
"during online backup"),
errhint("This means that the backup being taken on the standby "
"is corrupt and should not be used. "
"Enable full_page_writes and run CHECKPOINT on the primary, "
"and then try an online backup again.")));
LWLockAcquire(ControlFileLock, LW_SHARED);
stoppoint = ControlFile->minRecoveryPoint;
stoptli = ControlFile->minRecoveryPointTLI;
LWLockRelease(ControlFileLock);
}
else
{
/*
* Write the backup-end xlog record
*/
XLogBeginInsert();
XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
/*
* Given that we're not in recovery, InsertTimeLineID is set and can't
Change ThisTimeLineID from a global variable to a local variable. StartupXLOG() still has ThisTimeLineID as a local variable, but the remaining code in xlog.c now needs to the relevant TimeLineID by some other means. Mostly, this means that we now pass it as a function parameter to a bunch of functions where we didn't previously. However, a few cases require special handling: - In functions that might be called by outside callers who wouldn't necessarily know what timeline to specify, we get the timeline ID from shared memory. XLogCtl->ThisTimeLineID can be used in most cases since recovery is known to have completed by the time those functions are called. In xlog_redo(), we can use XLogCtl->replayEndTLI. - XLogFileClose() needs to know the TLI of the open logfile. Do that with a new global variable openLogTLI. While someone could argue that this is just trading one global variable for another, the new one has a far more narrow purposes and is referenced in just a few places. - read_backup_label() now returns the TLI that it obtains by parsing the backup_label file. Previously, ReadRecord() could be called to parse the checkpoint record without ThisTimeLineID having been initialized. Now, the timeline is passed down, and I didn't want to pass an uninitialized variable; this change lets us avoid that. The old coding didn't seem to have any practical consequences that we need to worry about, but this is cleaner. - In BootstrapXLOG(), it's just a constant. Patch by me, reviewed and tested by Michael Paquier, Amul Sul, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmobfAAqhfWa1kaFBBFvX+5CjM=7TE=n4r4Q1o2bjbGYBpA@mail.gmail.com
2021-11-05 17:53:15 +01:00
* change, so we can read it without a lock.
*/
stoptli = XLogCtl->InsertTimeLineID;
/*
* Force a switch to a new xlog segment file, so that the backup is
* valid as soon as archiver moves out the current segment file.
*/
RequestXLogSwitch(false);
XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
/* Use the log timezone here, not the session timezone */
stamp_time = (pg_time_t) time(NULL);
pg_strftime(strfbuf, sizeof(strfbuf),
"%Y-%m-%d %H:%M:%S %Z",
pg_localtime(&stamp_time, log_timezone));
/*
* Write the backup history file
*/
XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
startpoint, wal_segment_size);
fp = AllocateFile(histfilepath, "w");
if (!fp)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m",
histfilepath)));
fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
LSN_FORMAT_ARGS(startpoint), startxlogfilename);
fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
LSN_FORMAT_ARGS(stoppoint), stopxlogfilename);
/*
* Transfer remaining lines including label and start timeline to
* history file.
*/
fprintf(fp, "%s", remaining);
fprintf(fp, "STOP TIME: %s\n", strfbuf);
fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
if (fflush(fp) || ferror(fp) || FreeFile(fp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
histfilepath)));
2004-08-29 07:07:03 +02:00
/*
* Clean out any no-longer-needed history files. As a side effect,
* this will post a .ready file for the newly created history file,
* notifying the archiver that history file may be archived
* immediately.
*/
CleanupBackupHistory();
}
2004-08-29 07:07:03 +02:00
/*
* If archiving is enabled, wait for all the required WAL files to be
* archived before returning. If archiving isn't enabled, the required WAL
* needs to be transported via streaming replication (hopefully with
* wal_keep_size set high enough), or some more exotic mechanism like
* polling and copying files from pg_wal with script. We have no knowledge
* of those mechanisms, so it's up to the user to ensure that he gets all
* the required WAL.
*
* We wait until both the last WAL file filled during backup and the
* history file have been archived, and assume that the alphabetic sorting
* property of the WAL files ensures any earlier WAL files are safely
* archived as well.
*
* We wait forever, since archive_command is supposed to work and we
* assume the admin wanted his backup to work completely. If you don't
* wish to wait, then either waitforarchive should be passed in as false,
* or you can set statement_timeout. Also, some notices are issued to
* clue in anyone who might be doing this interactively.
*/
if (waitforarchive &&
((!backup_started_in_recovery && XLogArchivingActive()) ||
(backup_started_in_recovery && XLogArchivingAlways())))
{
XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
BackupHistoryFileName(histfilename, stoptli, _logSegNo,
startpoint, wal_segment_size);
seconds_before_warning = 60;
waits = 0;
while (XLogArchiveIsBusy(lastxlogfilename) ||
XLogArchiveIsBusy(histfilename))
{
CHECK_FOR_INTERRUPTS();
if (!reported_waiting && waits > 5)
2010-07-06 21:19:02 +02:00
{
ereport(NOTICE,
(errmsg("base backup done, waiting for required WAL segments to be archived")));
reported_waiting = true;
2010-07-06 21:19:02 +02:00
}
(void) WaitLatch(MyLatch,
WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1000L,
WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
ResetLatch(MyLatch);
2010-07-06 21:19:02 +02:00
if (++waits >= seconds_before_warning)
{
seconds_before_warning *= 2; /* This wraps in >10 years... */
ereport(WARNING,
(errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
waits),
2010-03-21 01:17:59 +01:00
errhint("Check that your archive_command is executing properly. "
"You can safely cancel this backup, "
"but the database backup will not be usable without all the WAL segments.")));
2010-07-06 21:19:02 +02:00
}
}
ereport(NOTICE,
(errmsg("all required WAL segments have been archived")));
}
else if (waitforarchive)
ereport(NOTICE,
(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
/*
* We're done. As a convenience, return the ending WAL location.
*/
if (stoptli_p)
*stoptli_p = stoptli;
return stoppoint;
}
/*
* do_pg_abort_backup: abort a running backup
*
* This does just the most basic steps of do_pg_stop_backup(), by taking the
* system out of backup mode, thus making it a lot more safe to call from
* an error handler.
*
* The caller can pass 'arg' as 'true' or 'false' to control whether a warning
* is emitted.
*
* NB: This is only for aborting a non-exclusive backup that doesn't write
* backup_label. A backup started with pg_start_backup() needs to be finished
* with pg_stop_backup().
*
* NB: This gets used as a before_shmem_exit handler, hence the odd-looking
* signature.
*/
void
do_pg_abort_backup(int code, Datum arg)
{
bool emit_warning = DatumGetBool(arg);
/*
* Quick exit if session is not keeping around a non-exclusive backup
* already started.
*/
if (sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE)
return;
WALInsertLockAcquireExclusive();
Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
XLogCtl->Insert.nonExclusiveBackups--;
if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
XLogCtl->Insert.nonExclusiveBackups == 0)
{
XLogCtl->Insert.forcePageWrites = false;
}
WALInsertLockRelease();
if (emit_warning)
ereport(WARNING,
2020-04-26 13:48:33 +02:00
(errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
}
/*
* Register a handler that will warn about unterminated backups at end of
* session, unless this has already been done.
*/
void
register_persistent_abort_backup_handler(void)
{
static bool already_done = false;
if (already_done)
return;
before_shmem_exit(do_pg_abort_backup, DatumGetBool(true));
already_done = true;
}
/*
* Get latest WAL insert pointer
*/
XLogRecPtr
GetXLogInsertRecPtr(void)
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
uint64 current_bytepos;
SpinLockAcquire(&Insert->insertpos_lck);
current_bytepos = Insert->CurrBytePos;
SpinLockRelease(&Insert->insertpos_lck);
return XLogBytePosToRecPtr(current_bytepos);
}
/*
* Get latest WAL write pointer
*/
XLogRecPtr
GetXLogWriteRecPtr(void)
{
SpinLockAcquire(&XLogCtl->info_lck);
LogwrtResult = XLogCtl->LogwrtResult;
SpinLockRelease(&XLogCtl->info_lck);
return LogwrtResult.Write;
}
/*
* Returns the redo pointer of the last checkpoint or restartpoint. This is
* the oldest point in WAL that we still need, if we have to restart recovery.
*/
void
GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
{
LWLockAcquire(ControlFileLock, LW_SHARED);
*oldrecptr = ControlFile->checkPointCopy.redo;
*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
LWLockRelease(ControlFileLock);
}
/*
* BackupInProgress: check if online backup mode is active
*
* This is done by checking for existence of the "backup_label" file.
*/
bool
BackupInProgress(void)
{
struct stat stat_buf;
return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
}
/*
* CancelBackup: rename the "backup_label" and "tablespace_map"
* files to cancel backup mode
*
* If the "backup_label" file exists, it will be renamed to "backup_label.old".
* Similarly, if the "tablespace_map" file exists, it will be renamed to
* "tablespace_map.old".
*
* Note that this will render an online backup in progress
* useless. To correctly finish an online backup, pg_stop_backup must be
* called.
*/
void
CancelBackup(void)
{
struct stat stat_buf;
/* if the backup_label file is not there, return */
if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
return;
/* remove leftover file from previously canceled backup if it exists */
unlink(BACKUP_LABEL_OLD);
if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
{
ereport(WARNING,
(errcode_for_file_access(),
errmsg("online backup mode was not canceled"),
errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
return;
}
/* if the tablespace_map file is not there, return */
if (stat(TABLESPACE_MAP, &stat_buf) < 0)
{
ereport(LOG,
(errmsg("online backup mode canceled"),
errdetail("File \"%s\" was renamed to \"%s\".",
BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
return;
}
/* remove leftover file from previously canceled backup if it exists */
unlink(TABLESPACE_MAP_OLD);
if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
{
ereport(LOG,
(errmsg("online backup mode canceled"),
errdetail("Files \"%s\" and \"%s\" were renamed to "
"\"%s\" and \"%s\", respectively.",
BACKUP_LABEL_FILE, TABLESPACE_MAP,
BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
}
else
{
ereport(WARNING,
(errcode_for_file_access(),
errmsg("online backup mode canceled"),
errdetail("File \"%s\" was renamed to \"%s\", but "
"file \"%s\" could not be renamed to \"%s\": %m.",
BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
}
}
/* Thin wrapper around ShutdownWalRcv(). */
void
XLogShutdownWalRcv(void)
{
ShutdownWalRcv();
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
XLogCtl->InstallXLogFileSegmentActive = false;
LWLockRelease(ControlFileLock);
}
/* Enable WAL file recycling and preallocation. */
void
SetInstallXLogFileSegmentActive(void)
{
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
XLogCtl->InstallXLogFileSegmentActive = true;
LWLockRelease(ControlFileLock);
}
bool
IsInstallXLogFileSegmentActive(void)
{
bool result;
LWLockAcquire(ControlFileLock, LW_SHARED);
result = XLogCtl->InstallXLogFileSegmentActive;
LWLockRelease(ControlFileLock);
return result;
}
/*
* Update the WalWriterSleeping flag.
*/
void
SetWalWriterSleeping(bool sleeping)
{
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->WalWriterSleeping = sleeping;
SpinLockRelease(&XLogCtl->info_lck);
}