postgresql/src/include/catalog/pg_control.h
Andres Freund fc49e24fa6 Make WAL segment size configurable at initdb time.
For performance reasons a larger segment size than the default 16MB
can be useful. A larger segment size has two main benefits: Firstly,
in setups using archiving, it makes it easier to write scripts that
can keep up with higher amounts of WAL, secondly, the WAL has to be
written and synced to disk less frequently.

But at the same time large segment size are disadvantageous for
smaller databases. So far the segment size had to be configured at
compile time, often making it unrealistic to choose one fitting to a
particularly load. Therefore change it to a initdb time setting.

This includes a breaking changes to the xlogreader.h API, which now
requires the current segment size to be configured.  For that and
similar reasons a number of binaries had to be taught how to recognize
the current segment size.

Author: Beena Emerson, editorialized by Andres Freund
Reviewed-By: Andres Freund, David Steele, Kuntal Ghosh, Michael
    Paquier, Peter Eisentraut, Robert Hass, Tushar Ahuja
Discussion: https://postgr.es/m/CAOG9ApEAcQ--1ieKbhFzXSQPw_YLmepaa4hNdnY5+ZULpt81Mw@mail.gmail.com
2017-09-19 22:03:48 -07:00

253 lines
9.2 KiB
C

/*-------------------------------------------------------------------------
*
* pg_control.h
* The system control file "pg_control" is not a heap relation.
* However, we define it here so that the format is documented.
*
*
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/catalog/pg_control.h
*
*-------------------------------------------------------------------------
*/
#ifndef PG_CONTROL_H
#define PG_CONTROL_H
#include "access/xlogdefs.h"
#include "pgtime.h" /* for pg_time_t */
#include "port/pg_crc32c.h"
/* Version identifier for this pg_control format */
#define PG_CONTROL_VERSION 1003
/* Nonce key length, see below */
#define MOCK_AUTH_NONCE_LEN 32
/*
* Body of CheckPoint XLOG records. This is declared here because we keep
* a copy of the latest one in pg_control for possible disaster recovery.
* Changing this struct requires a PG_CONTROL_VERSION bump.
*/
typedef struct CheckPoint
{
XLogRecPtr redo; /* next RecPtr available when we began to
* create CheckPoint (i.e. REDO start point) */
TimeLineID ThisTimeLineID; /* current TLI */
TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new
* timeline (equals ThisTimeLineID otherwise) */
bool fullPageWrites; /* current full_page_writes */
uint32 nextXidEpoch; /* higher-order bits of nextXid */
TransactionId nextXid; /* next free XID */
Oid nextOid; /* next free OID */
MultiXactId nextMulti; /* next free MultiXactId */
MultiXactOffset nextMultiOffset; /* next free MultiXact offset */
TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */
Oid oldestXidDB; /* database with minimum datfrozenxid */
MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */
Oid oldestMultiDB; /* database with minimum datminmxid */
pg_time_t time; /* time stamp of checkpoint */
TransactionId oldestCommitTsXid; /* oldest Xid with valid commit
* timestamp */
TransactionId newestCommitTsXid; /* newest Xid with valid commit
* timestamp */
/*
* Oldest XID still running. This is only needed to initialize hot standby
* mode from an online checkpoint, so we only bother calculating this for
* online checkpoints and only when wal_level is replica. Otherwise it's
* set to InvalidTransactionId.
*/
TransactionId oldestActiveXid;
} CheckPoint;
/* XLOG info values for XLOG rmgr */
#define XLOG_CHECKPOINT_SHUTDOWN 0x00
#define XLOG_CHECKPOINT_ONLINE 0x10
#define XLOG_NOOP 0x20
#define XLOG_NEXTOID 0x30
#define XLOG_SWITCH 0x40
#define XLOG_BACKUP_END 0x50
#define XLOG_PARAMETER_CHANGE 0x60
#define XLOG_RESTORE_POINT 0x70
#define XLOG_FPW_CHANGE 0x80
#define XLOG_END_OF_RECOVERY 0x90
#define XLOG_FPI_FOR_HINT 0xA0
#define XLOG_FPI 0xB0
/*
* System status indicator. Note this is stored in pg_control; if you change
* it, you must bump PG_CONTROL_VERSION
*/
typedef enum DBState
{
DB_STARTUP = 0,
DB_SHUTDOWNED,
DB_SHUTDOWNED_IN_RECOVERY,
DB_SHUTDOWNING,
DB_IN_CRASH_RECOVERY,
DB_IN_ARCHIVE_RECOVERY,
DB_IN_PRODUCTION
} DBState;
/*
* Contents of pg_control.
*/
typedef struct ControlFileData
{
/*
* Unique system identifier --- to ensure we match up xlog files with the
* installation that produced them.
*/
uint64 system_identifier;
/*
* Version identifier information. Keep these fields at the same offset,
* especially pg_control_version; they won't be real useful if they move
* around. (For historical reasons they must be 8 bytes into the file
* rather than immediately at the front.)
*
* pg_control_version identifies the format of pg_control itself.
* catalog_version_no identifies the format of the system catalogs.
*
* There are additional version identifiers in individual files; for
* example, WAL logs contain per-page magic numbers that can serve as
* version cues for the WAL log.
*/
uint32 pg_control_version; /* PG_CONTROL_VERSION */
uint32 catalog_version_no; /* see catversion.h */
/*
* System status data
*/
DBState state; /* see enum above */
pg_time_t time; /* time stamp of last pg_control update */
XLogRecPtr checkPoint; /* last check point record ptr */
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
CheckPoint checkPointCopy; /* copy of last check point record */
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
/*
* These two values determine the minimum point we must recover up to
* before starting up:
*
* minRecoveryPoint is updated to the latest replayed LSN whenever we
* flush a data change during archive recovery. That guards against
* starting archive recovery, aborting it, and restarting with an earlier
* stop location. If we've already flushed data changes from WAL record X
* to disk, we mustn't start up until we reach X again. Zero when not
* doing archive recovery.
*
* backupStartPoint is the redo pointer of the backup start checkpoint, if
* we are recovering from an online backup and haven't reached the end of
* backup yet. It is reset to zero when the end of backup is reached, and
* we mustn't start up before that. A boolean would suffice otherwise, but
* we use the redo pointer as a cross-check when we see an end-of-backup
* record, to make sure the end-of-backup record corresponds the base
* backup we're recovering from.
*
* backupEndPoint is the backup end location, if we are recovering from an
* online backup which was taken from the standby and haven't reached the
* end of backup yet. It is initialized to the minimum recovery point in
* pg_control which was backed up last. It is reset to zero when the end
* of backup is reached, and we mustn't start up before that.
*
* If backupEndRequired is true, we know for sure that we're restoring
* from a backup, and must see a backup-end record before we can safely
* start up. If it's false, but backupStartPoint is set, a backup_label
* file was found at startup but it may have been a leftover from a stray
* pg_start_backup() call, not accompanied by pg_stop_backup().
*/
XLogRecPtr minRecoveryPoint;
TimeLineID minRecoveryPointTLI;
XLogRecPtr backupStartPoint;
XLogRecPtr backupEndPoint;
bool backupEndRequired;
/*
* Parameter settings that determine if the WAL can be used for archival
* or hot standby.
*/
int wal_level;
bool wal_log_hints;
int MaxConnections;
int max_worker_processes;
int max_prepared_xacts;
int max_locks_per_xact;
bool track_commit_timestamp;
/*
* This data is used to check for hardware-architecture compatibility of
* the database and the backend executable. We need not check endianness
* explicitly, since the pg_control version will surely look wrong to a
* machine of different endianness, but we do need to worry about MAXALIGN
* and floating-point format. (Note: storage layout nominally also
* depends on SHORTALIGN and INTALIGN, but in practice these are the same
* on all architectures of interest.)
*
* Testing just one double value is not a very bulletproof test for
* floating-point compatibility, but it will catch most cases.
*/
uint32 maxAlign; /* alignment requirement for tuples */
double floatFormat; /* constant 1234567.0 */
#define FLOATFORMAT_VALUE 1234567.0
/*
* This data is used to make sure that configuration of this database is
* compatible with the backend executable.
*/
uint32 blcksz; /* data block size for this DB */
uint32 relseg_size; /* blocks per segment of large relation */
uint32 xlog_blcksz; /* block size within WAL files */
uint32 xlog_seg_size; /* size of each WAL segment */
uint32 nameDataLen; /* catalog name field width */
uint32 indexMaxKeys; /* max number of columns in an index */
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
uint32 loblksize; /* chunk size in pg_largeobject */
/* flags indicating pass-by-value status of various types */
bool float4ByVal; /* float4 pass-by-value? */
bool float8ByVal; /* float8, int8, etc pass-by-value? */
/* Are data pages protected by checksums? Zero if no checksum version */
uint32 data_checksum_version;
/*
* Random nonce, used in authentication requests that need to proceed
* based on values that are cluster-unique, like a SASL exchange that
* failed at an early stage.
*/
char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN];
/* CRC of all above ... MUST BE LAST! */
pg_crc32c crc;
} ControlFileData;
/*
* Maximum safe value of sizeof(ControlFileData). For reliability's sake,
* it's critical that pg_control updates be atomic writes. That generally
* means the active data can't be more than one disk sector, which is 512
* bytes on common hardware. Be very careful about raising this limit.
*/
#define PG_CONTROL_MAX_SAFE_SIZE 512
/*
* Physical size of the pg_control file. Note that this is considerably
* bigger than the actually used size (ie, sizeof(ControlFileData)).
* The idea is to keep the physical size constant independent of format
* changes, so that ReadControlFile will deliver a suitable wrong-version
* message instead of a read error if it's looking at an incompatible file.
*/
#define PG_CONTROL_FILE_SIZE 8192
#endif /* PG_CONTROL_H */