2000-10-28 18:21:00 +02:00
|
|
|
/*
|
|
|
|
* xlogdefs.h
|
|
|
|
*
|
2017-05-12 17:49:56 +02:00
|
|
|
* Postgres write-ahead log manager record pointer and
|
2004-07-22 00:31:26 +02:00
|
|
|
* timeline number definitions
|
XLOG (and related) changes:
* Store two past checkpoint locations, not just one, in pg_control.
On startup, we fall back to the older checkpoint if the newer one
is unreadable. Also, a physical copy of the newest checkpoint record
is kept in pg_control for possible use in disaster recovery (ie,
complete loss of pg_xlog). Also add a version number for pg_control
itself. Remove archdir from pg_control; it ought to be a GUC
parameter, not a special case (not that it's implemented yet anyway).
* Suppress successive checkpoint records when nothing has been entered
in the WAL log since the last one. This is not so much to avoid I/O
as to make it actually useful to keep track of the last two
checkpoints. If the things are right next to each other then there's
not a lot of redundancy gained...
* Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs
on alternate bytes. Polynomial borrowed from ECMA DLT1 standard.
* Fix XLOG record length handling so that it will work at BLCKSZ = 32k.
* Change XID allocation to work more like OID allocation. (This is of
dubious necessity, but I think it's a good idea anyway.)
* Fix a number of minor bugs, such as off-by-one logic for XLOG file
wraparound at the 4 gig mark.
* Add documentation and clean up some coding infelicities; move file
format declarations out to include files where planned contrib
utilities can get at them.
* Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or
every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also
possible to force a checkpoint by sending SIGUSR1 to the postmaster
(undocumented feature...)
* Defend against kill -9 postmaster by storing shmem block's key and ID
in postmaster.pid lockfile, and checking at startup to ensure that no
processes are still connected to old shmem block (if it still exists).
* Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency
stop, for symmetry with postmaster and xlog utilities. Clean up signal
handling in bootstrap.c so that xlog utilities launched by postmaster
will react to signals better.
* Standalone bootstrap now grabs lockfile in target directory, as added
insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
|
|
|
*
|
2017-01-03 19:48:53 +01:00
|
|
|
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
XLOG (and related) changes:
* Store two past checkpoint locations, not just one, in pg_control.
On startup, we fall back to the older checkpoint if the newer one
is unreadable. Also, a physical copy of the newest checkpoint record
is kept in pg_control for possible use in disaster recovery (ie,
complete loss of pg_xlog). Also add a version number for pg_control
itself. Remove archdir from pg_control; it ought to be a GUC
parameter, not a special case (not that it's implemented yet anyway).
* Suppress successive checkpoint records when nothing has been entered
in the WAL log since the last one. This is not so much to avoid I/O
as to make it actually useful to keep track of the last two
checkpoints. If the things are right next to each other then there's
not a lot of redundancy gained...
* Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs
on alternate bytes. Polynomial borrowed from ECMA DLT1 standard.
* Fix XLOG record length handling so that it will work at BLCKSZ = 32k.
* Change XID allocation to work more like OID allocation. (This is of
dubious necessity, but I think it's a good idea anyway.)
* Fix a number of minor bugs, such as off-by-one logic for XLOG file
wraparound at the 4 gig mark.
* Add documentation and clean up some coding infelicities; move file
format declarations out to include files where planned contrib
utilities can get at them.
* Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or
every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also
possible to force a checkpoint by sending SIGUSR1 to the postmaster
(undocumented feature...)
* Defend against kill -9 postmaster by storing shmem block's key and ID
in postmaster.pid lockfile, and checking at startup to ensure that no
processes are still connected to old shmem block (if it still exists).
* Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency
stop, for symmetry with postmaster and xlog utilities. Clean up signal
handling in bootstrap.c so that xlog utilities launched by postmaster
will react to signals better.
* Standalone bootstrap now grabs lockfile in target directory, as added
insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
2000-10-28 18:21:00 +02:00
|
|
|
*
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/include/access/xlogdefs.h
|
2000-10-28 18:21:00 +02:00
|
|
|
*/
|
|
|
|
#ifndef XLOG_DEFS_H
|
|
|
|
#define XLOG_DEFS_H
|
|
|
|
|
2008-05-17 19:24:57 +02:00
|
|
|
#include <fcntl.h> /* need open() flags */
|
|
|
|
|
XLOG (and related) changes:
* Store two past checkpoint locations, not just one, in pg_control.
On startup, we fall back to the older checkpoint if the newer one
is unreadable. Also, a physical copy of the newest checkpoint record
is kept in pg_control for possible use in disaster recovery (ie,
complete loss of pg_xlog). Also add a version number for pg_control
itself. Remove archdir from pg_control; it ought to be a GUC
parameter, not a special case (not that it's implemented yet anyway).
* Suppress successive checkpoint records when nothing has been entered
in the WAL log since the last one. This is not so much to avoid I/O
as to make it actually useful to keep track of the last two
checkpoints. If the things are right next to each other then there's
not a lot of redundancy gained...
* Change CRC scheme to a true 64-bit CRC, not a pair of 32-bit CRCs
on alternate bytes. Polynomial borrowed from ECMA DLT1 standard.
* Fix XLOG record length handling so that it will work at BLCKSZ = 32k.
* Change XID allocation to work more like OID allocation. (This is of
dubious necessity, but I think it's a good idea anyway.)
* Fix a number of minor bugs, such as off-by-one logic for XLOG file
wraparound at the 4 gig mark.
* Add documentation and clean up some coding infelicities; move file
format declarations out to include files where planned contrib
utilities can get at them.
* Checkpoint will now occur every CHECKPOINT_SEGMENTS log segments or
every CHECKPOINT_TIMEOUT seconds, whichever comes first. It is also
possible to force a checkpoint by sending SIGUSR1 to the postmaster
(undocumented feature...)
* Defend against kill -9 postmaster by storing shmem block's key and ID
in postmaster.pid lockfile, and checking at startup to ensure that no
processes are still connected to old shmem block (if it still exists).
* Switch backends to accept SIGQUIT rather than SIGUSR1 for emergency
stop, for symmetry with postmaster and xlog utilities. Clean up signal
handling in bootstrap.c so that xlog utilities launched by postmaster
will react to signals better.
* Standalone bootstrap now grabs lockfile in target directory, as added
insurance against running it in parallel with live postmaster.
2001-03-13 02:17:06 +01:00
|
|
|
/*
|
|
|
|
* Pointer to a location in the XLOG. These pointers are 64 bits wide,
|
|
|
|
* because we don't want them ever to overflow.
|
|
|
|
*/
|
2012-06-24 17:51:37 +02:00
|
|
|
typedef uint64 XLogRecPtr;
|
2007-08-02 00:45:09 +02:00
|
|
|
|
2012-06-24 17:51:37 +02:00
|
|
|
/*
|
|
|
|
* Zero is used to indicate an invalid pointer. Bootstrap skips the first possible
|
|
|
|
* WAL segment, initializing the first WAL page at XLOG_SEG_SIZE, so no XLOG
|
|
|
|
* record can begin at zero.
|
|
|
|
*/
|
|
|
|
#define InvalidXLogRecPtr 0
|
|
|
|
#define XLogRecPtrIsInvalid(r) ((r) == InvalidXLogRecPtr) /* true iff r equals InvalidXLogRecPtr (zero) */
|
2003-12-20 18:31:21 +01:00
|
|
|
|
2012-06-24 17:06:38 +02:00
|
|
|
/*
|
|
|
|
* XLogSegNo - physical log file sequence number.
|
|
|
|
*/
|
|
|
|
typedef uint64 XLogSegNo;
|
2010-01-15 10:19:10 +01:00
|
|
|
|
2000-10-28 18:21:00 +02:00
|
|
|
/*
|
2004-07-22 00:31:26 +02:00
|
|
|
* TimeLineID (TLI) - identifies different database histories to prevent
|
|
|
|
* confusion after restoring a prior state of a database installation.
|
|
|
|
* TLI does not change in a normal stop/restart of the database (including
|
|
|
|
* crash-and-recover cases); but we must assign a new TLI after doing
|
|
|
|
* a recovery to a prior state, a/k/a point-in-time recovery. This makes
|
|
|
|
* the new WAL logfile sequence we generate distinguishable from the
|
|
|
|
* sequence that was generated in the previous incarnation.
|
2000-10-28 18:21:00 +02:00
|
|
|
*/
|
2004-07-22 00:31:26 +02:00
|
|
|
typedef uint32 TimeLineID;
|
2001-10-28 07:26:15 +01:00
|
|
|
|
Introduce replication progress tracking infrastructure.
When implementing a replication solution on top of logical decoding, two
related problems exist:
* How to safely keep track of replication progress
* How to change replication behavior, based on the origin of a row;
e.g. to avoid loops in bi-directional replication setups
The solution to these problems, as implemented here, consists of
three parts:
1) 'replication origins', which identify nodes in a replication setup.
2) 'replication progress tracking', which remembers, for each
replication origin, how far replay has progressed in an efficient and
crash safe manner.
3) The ability to filter out changes performed at the behest of a
replication origin during logical decoding; this allows complex
replication topologies. E.g. by filtering all replayed changes out.
Most of this could also be implemented in "userspace", e.g. by inserting
additional rows containing origin information, but that ends up being much
less efficient and more complicated. We don't want to require various
replication solutions to reimplement logic for this independently. The
infrastructure is intended to be generic enough to be reusable.
This infrastructure also replaces the 'nodeid' infrastructure of commit
timestamps. It is intended to provide all the former capabilities,
except that there's only 2^16 different origins; but now they integrate
with logical decoding. Additionally more functionality is accessible via
SQL. Since the commit timestamp infrastructure has also been introduced
in 9.5 (commit 73c986add) changing the API is not a problem.
For now the number of origins for which the replication progress can be
tracked simultaneously is determined by the max_replication_slots
GUC. That GUC is not a perfect match to configure this, but there
doesn't seem to be sufficient reason to introduce a separate new one.
Bumps both catversion and wal page magic.
Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer
Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer
Discussion: 20150216002155.GI15326@awork2.anarazel.de,
20140923182422.GA15776@alap3.anarazel.de,
20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
|
|
|
/*
|
|
|
|
* Replication origin id - this is located in this file to avoid having to
|
|
|
|
* include origin.h in a bunch of xlog related places.
|
|
|
|
*/
|
|
|
|
typedef uint16 RepOriginId;
|
|
|
|
|
2007-02-14 06:00:40 +01:00
|
|
|
/*
|
|
|
|
* Because O_DIRECT bypasses the kernel buffers, and because we never
|
2011-03-11 12:46:44 +01:00
|
|
|
* read those buffers except during crash recovery or if wal_level != minimal,
|
|
|
|
* it is a win to use it in all cases where we sync on each write(). We could
|
|
|
|
* allow O_DIRECT with fsync(), but it is unclear if fsync() could process
|
2014-05-06 18:12:18 +02:00
|
|
|
* writes not buffered in the kernel. Also, O_DIRECT is never enough to force
|
2011-03-11 12:46:44 +01:00
|
|
|
* data to the drives, it merely tries to bypass the kernel cache, so we still
|
|
|
|
* need O_SYNC/O_DSYNC.
|
2007-02-14 06:00:40 +01:00
|
|
|
*/
|
|
|
|
#ifdef O_DIRECT
|
|
|
|
#define PG_O_DIRECT O_DIRECT
|
|
|
|
#else
|
|
|
|
#define PG_O_DIRECT 0 /* O_DIRECT is not defined by <fcntl.h> here: expand to 0 instead */
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This chunk of hackery attempts to determine which file sync methods
|
|
|
|
* are available on the current platform, and to choose an appropriate
|
2014-05-06 18:12:18 +02:00
|
|
|
* default method. We assume that fsync() is always available, and that
|
2007-02-14 06:00:40 +01:00
|
|
|
* configure determined whether fdatasync() is.
|
|
|
|
*/
|
|
|
|
#if defined(O_SYNC)
|
2010-02-19 11:51:04 +01:00
|
|
|
#define OPEN_SYNC_FLAG O_SYNC
|
2007-02-14 06:00:40 +01:00
|
|
|
#elif defined(O_FSYNC)
|
2010-02-19 11:51:04 +01:00
|
|
|
#define OPEN_SYNC_FLAG O_FSYNC
|
2007-02-14 06:00:40 +01:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(O_DSYNC)
|
|
|
|
#if defined(OPEN_SYNC_FLAG)
|
|
|
|
/* O_DSYNC is distinct? */
|
2010-02-19 11:51:04 +01:00
|
|
|
#if O_DSYNC != OPEN_SYNC_FLAG
|
|
|
|
#define OPEN_DATASYNC_FLAG O_DSYNC
|
2007-02-14 06:00:40 +01:00
|
|
|
#endif
|
|
|
|
#else /* !defined(OPEN_SYNC_FLAG) */
|
|
|
|
/* Win32 only has O_DSYNC */
|
2010-02-19 11:51:04 +01:00
|
|
|
#define OPEN_DATASYNC_FLAG O_DSYNC
|
2007-02-14 06:00:40 +01:00
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
2010-12-09 02:01:09 +01:00
|
|
|
#if defined(PLATFORM_DEFAULT_SYNC_METHOD) /* checked first, so a predefined platform default wins over the choices below */
|
|
|
|
#define DEFAULT_SYNC_METHOD PLATFORM_DEFAULT_SYNC_METHOD
|
|
|
|
#elif defined(OPEN_DATASYNC_FLAG)
|
2008-05-12 10:35:05 +02:00
|
|
|
#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN_DSYNC
|
2007-02-14 06:00:40 +01:00
|
|
|
#elif defined(HAVE_FDATASYNC)
|
|
|
|
#define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC
|
|
|
|
#else
|
|
|
|
#define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC
|
|
|
|
#endif
|
|
|
|
|
2001-11-05 18:46:40 +01:00
|
|
|
#endif /* XLOG_DEFS_H */
|