2005-06-18 00:32:51 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* twophase.c
|
|
|
|
* Two-phase commit support functions.
|
|
|
|
*
|
2017-01-03 19:48:53 +01:00
|
|
|
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
2005-06-18 00:32:51 +02:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/backend/access/transam/twophase.c
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
|
|
|
* NOTES
|
|
|
|
* Each global transaction is associated with a global transaction
|
|
|
|
* identifier (GID). The client assigns a GID to a postgres
|
|
|
|
* transaction with the PREPARE TRANSACTION command.
|
|
|
|
*
|
|
|
|
* We keep all active global transactions in a shared memory array.
|
|
|
|
* When the PREPARE TRANSACTION command is issued, the GID is
|
|
|
|
* reserved for the transaction in the array. This is done before
|
|
|
|
* a WAL entry is made, because the reservation checks for duplicate
|
|
|
|
* GIDs and aborts the transaction if there already is a global
|
|
|
|
* transaction in prepared state with the same GID.
|
|
|
|
*
|
2012-05-14 09:22:44 +02:00
|
|
|
* A global transaction (gxact) also has dummy PGXACT and PGPROC; this is
|
|
|
|
* what keeps the XID considered running by TransactionIdIsInProgress.
|
|
|
|
* It is also convenient as a PGPROC to hook the gxact's locks to.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
2016-01-21 03:40:44 +01:00
|
|
|
* Information to recover prepared transactions in case of crash is
|
|
|
|
* now stored in WAL for the common case. In some cases there will be
|
|
|
|
* an extended period between preparing a GXACT and commit/abort, in
|
|
|
|
* which case we need to separately record prepared transaction data
|
|
|
|
* in permanent storage. This includes locking information, pending
|
|
|
|
* notifications etc. All that state information is written to the
|
|
|
|
* per-transaction state file in the pg_twophase directory.
|
|
|
|
* All prepared transactions will be written prior to shutdown.
|
|
|
|
*
|
|
|
|
* The life cycle of the state data is as follows:
|
|
|
|
*
|
|
|
|
* * On PREPARE TRANSACTION, the backend writes state data only to the WAL and
|
|
|
|
* stores a pointer to the start of the WAL record in
|
|
|
|
* gxact->prepare_start_lsn.
|
|
|
|
* * If COMMIT occurs before a checkpoint, the backend reads the data from WAL
|
|
|
|
* using prepare_start_lsn.
|
|
|
|
* * On checkpoint, state data is copied to files in the pg_twophase directory and
|
|
|
|
* fsynced
|
|
|
|
* * If COMMIT happens after a checkpoint, the backend reads state data from
|
|
|
|
* files
|
2017-04-04 21:56:56 +02:00
|
|
|
*
|
|
|
|
* During replay and replication, TwoPhaseState also holds information
|
|
|
|
* about active prepared transactions that haven't been moved to disk yet.
|
|
|
|
*
|
|
|
|
* Replay of twophase records happens by the following rules:
|
|
|
|
*
|
|
|
|
* * At the beginning of recovery, pg_twophase is scanned once, filling
|
|
|
|
* TwoPhaseState with entries marked with gxact->inredo and
|
|
|
|
* gxact->ondisk. Two-phase file data older than the XID horizon of
|
|
|
|
* the redo position are discarded.
|
|
|
|
* * On PREPARE redo, the transaction is added to TwoPhaseState->prepXacts.
|
|
|
|
* gxact->inredo is set to true for such entries.
|
|
|
|
* * On Checkpoint we iterate through TwoPhaseState->prepXacts entries
|
|
|
|
* that have gxact->inredo set and are behind the redo_horizon. We
|
|
|
|
* save them to disk and then switch gxact->ondisk to true.
|
|
|
|
* * On COMMIT/ABORT we delete the entry from TwoPhaseState->prepXacts.
|
|
|
|
* If gxact->ondisk is true, the corresponding entry from the disk
|
|
|
|
* is additionally deleted.
|
|
|
|
* * RecoverPreparedTransactions(), StandbyRecoverPreparedTransactions()
|
|
|
|
* and PrescanPreparedTransactions() have been modified to go through
|
|
|
|
* gxact->inredo entries that have not made it to disk.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
|
|
|
|
#include <fcntl.h>
|
2005-06-20 00:34:56 +02:00
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <time.h>
|
2005-06-18 00:32:51 +02:00
|
|
|
#include <unistd.h>
|
|
|
|
|
2015-09-29 19:40:56 +02:00
|
|
|
#include "access/commit_ts.h"
|
2012-08-30 22:15:44 +02:00
|
|
|
#include "access/htup_details.h"
|
2005-06-18 00:32:51 +02:00
|
|
|
#include "access/subtrans.h"
|
2006-07-13 18:49:20 +02:00
|
|
|
#include "access/transam.h"
|
2005-06-18 00:32:51 +02:00
|
|
|
#include "access/twophase.h"
|
|
|
|
#include "access/twophase_rmgr.h"
|
|
|
|
#include "access/xact.h"
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible
for a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
#include "access/xlog.h"
|
2014-11-06 12:52:08 +01:00
|
|
|
#include "access/xloginsert.h"
|
2008-11-19 11:34:52 +01:00
|
|
|
#include "access/xlogutils.h"
|
2016-01-21 03:40:44 +01:00
|
|
|
#include "access/xlogreader.h"
|
2005-06-18 00:32:51 +02:00
|
|
|
#include "catalog/pg_type.h"
|
2008-11-19 11:34:52 +01:00
|
|
|
#include "catalog/storage.h"
|
2005-06-18 00:32:51 +02:00
|
|
|
#include "funcapi.h"
|
|
|
|
#include "miscadmin.h"
|
2008-08-01 15:16:09 +02:00
|
|
|
#include "pg_trace.h"
|
2005-06-19 22:00:39 +02:00
|
|
|
#include "pgstat.h"
|
2015-09-29 19:40:56 +02:00
|
|
|
#include "replication/origin.h"
|
2011-03-06 23:49:16 +01:00
|
|
|
#include "replication/syncrep.h"
|
2015-09-29 19:40:56 +02:00
|
|
|
#include "replication/walsender.h"
|
2005-06-18 00:32:51 +02:00
|
|
|
#include "storage/fd.h"
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transferring the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
#include "storage/ipc.h"
|
Implement genuine serializable isolation level.
Until now, our Serializable mode has in fact been what's called Snapshot
Isolation, which allows some anomalies that could not occur in any
serialized ordering of the transactions. This patch fixes that using a
method called Serializable Snapshot Isolation, based on research papers by
Michael J. Cahill (see README-SSI for full references). In Serializable
Snapshot Isolation, transactions run like they do in Snapshot Isolation,
but a predicate lock manager observes the reads and writes performed and
aborts transactions if it detects that an anomaly might occur. This method
produces some false positives, ie. it sometimes aborts transactions even
though there is no anomaly.
To track reads we implement predicate locking, see storage/lmgr/predicate.c.
Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared
memory is finite, so when a transaction takes many tuple-level locks on a
page, the locks are promoted to a single page-level lock, and further to a
single relation level lock if necessary. To lock key values with no matching
tuple, a sequential scan always takes a relation-level lock, and an index
scan acquires a page-level lock that covers the search key, whether or not
there are any matching keys at the moment.
A predicate lock doesn't conflict with any regular locks or with another
predicate locks in the normal sense. They're only used by the predicate lock
manager to detect the danger of anomalies. Only serializable transactions
participate in predicate locking, so there should be no extra overhead for
other transactions.
Predicate locks can't be released at commit, but must be remembered until
all the transactions that overlapped with it have completed. That means that
we need to remember an unbounded amount of predicate locks, so we apply a
lossy but conservative method of tracking locks for committed transactions.
If we run short of shared memory, we overflow to a new "pg_serial" SLRU
pool.
We don't currently allow Serializable transactions in Hot Standby mode.
That would be hard, because even read-only transactions can cause anomalies
that wouldn't otherwise occur.
Serializable isolation mode now means the new fully serializable level.
Repeatable Read gives you the old Snapshot Isolation level that we have
always had.
Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and
Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
|
|
|
#include "storage/predicate.h"
|
2012-06-25 23:45:15 +02:00
|
|
|
#include "storage/proc.h"
|
2005-06-18 00:32:51 +02:00
|
|
|
#include "storage/procarray.h"
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
#include "storage/sinvaladt.h"
|
2005-06-18 00:32:51 +02:00
|
|
|
#include "storage/smgr.h"
|
|
|
|
#include "utils/builtins.h"
|
2008-05-19 20:16:26 +02:00
|
|
|
#include "utils/memutils.h"
|
2011-09-09 19:23:41 +02:00
|
|
|
#include "utils/timestamp.h"
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Directory where Two-phase commit files reside within PGDATA
|
|
|
|
*/
|
|
|
|
/* Bare directory name, no leading path; the state files live under PGDATA. */
#define TWOPHASE_DIR "pg_twophase"
|
|
|
|
|
|
|
|
/* GUC variable, can't be changed after startup */
|
2009-04-23 02:23:46 +02:00
|
|
|
/*
 * Capacity of the prepared-transaction table: TwoPhaseShmemSize() and
 * TwoPhaseShmemInit() size the prepXacts array and the pool of
 * GlobalTransactionData structs from this value.
 */
int max_prepared_xacts = 0;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This struct describes one global transaction that is in prepared state
|
|
|
|
* or attempting to become prepared.
|
|
|
|
*
|
|
|
|
* The lifecycle of a global transaction is:
|
|
|
|
*
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
* 1. After checking that the requested GID is not in use, set up an entry in
|
|
|
|
* the TwoPhaseState->prepXacts array with the correct GID and valid = false,
|
|
|
|
* and mark it as locked by my backend.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
|
|
|
* 2. After successfully completing prepare, set valid = true and enter the
|
2012-05-14 09:22:44 +02:00
|
|
|
* referenced PGPROC into the global ProcArray.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
* 3. To begin COMMIT PREPARED or ROLLBACK PREPARED, check that the entry is
|
|
|
|
* valid and not locked, then mark the entry as locked by storing my current
|
|
|
|
* backend ID into locking_backend. This prevents concurrent attempts to
|
|
|
|
* commit or rollback the same prepared xact.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
|
|
|
* 4. On completion of COMMIT PREPARED or ROLLBACK PREPARED, remove the entry
|
|
|
|
* from the ProcArray and the TwoPhaseState->prepXacts array and return it to
|
|
|
|
* the freelist.
|
|
|
|
*
|
|
|
|
* Note that if the preparing transaction fails between steps 1 and 2, the
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
* entry must be removed so that the GID and the GlobalTransaction struct
|
|
|
|
* can be reused. See AtAbort_Twophase().
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
2005-10-15 04:49:52 +02:00
|
|
|
* typedef struct GlobalTransactionData *GlobalTransaction appears in
|
2005-06-18 00:32:51 +02:00
|
|
|
* twophase.h
|
2016-03-10 13:51:46 +01:00
|
|
|
*
|
|
|
|
* Note that the max value of GIDSIZE must fit in the uint16 gidlen,
|
|
|
|
* specified in TwoPhaseFileHeader.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
/* Size, in bytes, of the gid[] buffer in GlobalTransactionData. */
#define GIDSIZE 200
|
|
|
|
|
|
|
|
typedef struct GlobalTransactionData
|
|
|
|
{
|
2012-08-08 17:52:02 +02:00
|
|
|
GlobalTransaction next; /* list link for free list */
|
|
|
|
int pgprocno; /* ID of associated dummy PGPROC */
|
2010-02-26 03:01:40 +01:00
|
|
|
BackendId dummyBackendId; /* similar to backend id for backends */
|
2005-10-15 04:49:52 +02:00
|
|
|
TimestampTz prepared_at; /* time of preparation */
|
2016-01-21 03:40:44 +01:00
|
|
|
|
|
|
|
/*
|
2016-06-10 00:02:36 +02:00
|
|
|
* Note that we need to keep track of two LSNs for each GXACT. We keep
|
|
|
|
* track of the start LSN because this is the address we must use to read
|
|
|
|
* state data back from WAL when committing a prepared GXACT. We keep
|
|
|
|
* track of the end LSN because that is the LSN we need to wait for prior
|
|
|
|
* to commit.
|
2016-01-21 03:40:44 +01:00
|
|
|
*/
|
2016-06-10 00:02:36 +02:00
|
|
|
XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */
|
2016-01-21 03:40:44 +01:00
|
|
|
XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */
|
2017-04-04 21:56:56 +02:00
|
|
|
TransactionId xid; /* The GXACT id */
|
2016-01-21 03:40:44 +01:00
|
|
|
|
2005-06-28 07:09:14 +02:00
|
|
|
Oid owner; /* ID of user that executed the xact */
|
2015-05-24 03:35:49 +02:00
|
|
|
BackendId locking_backend; /* backend currently working on the xact */
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
bool valid; /* TRUE if PGPROC entry is in proc array */
|
2016-01-21 03:40:44 +01:00
|
|
|
bool ondisk; /* TRUE if prepare state file is on disk */
|
2017-04-04 21:56:56 +02:00
|
|
|
bool inredo; /* TRUE if entry was added via xlog_redo */
|
2005-10-15 04:49:52 +02:00
|
|
|
char gid[GIDSIZE]; /* The GID assigned to the prepared xact */
|
2011-04-10 17:42:00 +02:00
|
|
|
} GlobalTransactionData;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Two Phase Commit shared state. Access to this struct is protected
|
|
|
|
* by TwoPhaseStateLock.
|
|
|
|
*/
|
|
|
|
typedef struct TwoPhaseStateData
|
|
|
|
{
|
|
|
|
/* Head of linked list of free GlobalTransactionData structs */
|
2008-11-02 22:24:52 +01:00
|
|
|
GlobalTransaction freeGXacts;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/* Number of valid prepXacts entries. */
|
2005-10-15 04:49:52 +02:00
|
|
|
int numPrepXacts;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2015-02-20 23:32:01 +01:00
|
|
|
/* There are max_prepared_xacts items in this array */
|
|
|
|
GlobalTransaction prepXacts[FLEXIBLE_ARRAY_MEMBER];
|
|
|
|
} TwoPhaseStateData;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/* Set by TwoPhaseShmemInit() to the struct returned by ShmemInitStruct(). */
static TwoPhaseStateData *TwoPhaseState;
|
|
|
|
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
/*
|
|
|
|
* Global transaction entry currently locked by us, if any.
|
|
|
|
*/
|
2017-03-10 20:49:56 +01:00
|
|
|
/* NULL means no entry is locked; AtAbort_Twophase() then has nothing to do. */
static GlobalTransaction MyLockedGxact = NULL;
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
|
|
|
|
/*
 * NOTE(review): presumably records whether AtProcExit_Twophase() has already
 * been registered as a process-exit callback, so that it is registered only
 * once -- confirm at the registration site.
 */
static bool twophaseExitRegistered = false;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
static void RecordTransactionCommitPrepared(TransactionId xid,
|
2005-10-15 04:49:52 +02:00
|
|
|
int nchildren,
|
|
|
|
TransactionId *children,
|
|
|
|
int nrels,
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
RelFileNode *rels,
|
|
|
|
int ninvalmsgs,
|
|
|
|
SharedInvalidationMessage *invalmsgs,
|
|
|
|
bool initfileinval);
|
2005-06-18 00:32:51 +02:00
|
|
|
static void RecordTransactionAbortPrepared(TransactionId xid,
|
2005-10-15 04:49:52 +02:00
|
|
|
int nchildren,
|
|
|
|
TransactionId *children,
|
|
|
|
int nrels,
|
2008-11-19 11:34:52 +01:00
|
|
|
RelFileNode *rels);
|
2005-06-18 00:32:51 +02:00
|
|
|
static void ProcessRecords(char *bufptr, TransactionId xid,
|
2005-10-15 04:49:52 +02:00
|
|
|
const TwoPhaseCallback callbacks[]);
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
static void RemoveGXact(GlobalTransaction gxact);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2016-01-21 03:40:44 +01:00
|
|
|
static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len);
|
2017-04-04 21:56:56 +02:00
|
|
|
static char *ProcessTwoPhaseBuffer(TransactionId xid,
|
|
|
|
XLogRecPtr prepare_start_lsn,
|
|
|
|
bool fromdisk, bool overwriteOK, bool setParent,
|
2017-04-18 12:14:05 +02:00
|
|
|
bool setNextXid);
|
2017-04-04 21:56:56 +02:00
|
|
|
static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid,
|
|
|
|
const char *gid, TimestampTz prepared_at, Oid owner,
|
|
|
|
Oid databaseid);
|
|
|
|
static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning);
|
|
|
|
static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialization of shared memory
|
|
|
|
*/
|
2005-08-21 01:26:37 +02:00
|
|
|
Size
|
2005-06-18 00:32:51 +02:00
|
|
|
TwoPhaseShmemSize(void)
|
|
|
|
{
|
2005-08-21 01:26:37 +02:00
|
|
|
Size size;
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/* Need the fixed struct, the array of pointers, and the GTD structs */
|
2005-08-21 01:26:37 +02:00
|
|
|
size = offsetof(TwoPhaseStateData, prepXacts);
|
|
|
|
size = add_size(size, mul_size(max_prepared_xacts,
|
|
|
|
sizeof(GlobalTransaction)));
|
|
|
|
size = MAXALIGN(size);
|
|
|
|
size = add_size(size, mul_size(max_prepared_xacts,
|
|
|
|
sizeof(GlobalTransactionData)));
|
|
|
|
|
|
|
|
return size;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
TwoPhaseShmemInit(void)
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
bool found;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
TwoPhaseState = ShmemInitStruct("Prepared Transaction Table",
|
|
|
|
TwoPhaseShmemSize(),
|
|
|
|
&found);
|
|
|
|
if (!IsUnderPostmaster)
|
|
|
|
{
|
|
|
|
GlobalTransaction gxacts;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
Assert(!found);
|
2008-11-02 22:24:52 +01:00
|
|
|
TwoPhaseState->freeGXacts = NULL;
|
2005-06-18 00:32:51 +02:00
|
|
|
TwoPhaseState->numPrepXacts = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize the linked list of free GlobalTransactionData structs
|
|
|
|
*/
|
|
|
|
gxacts = (GlobalTransaction)
|
|
|
|
((char *) TwoPhaseState +
|
2005-10-15 04:49:52 +02:00
|
|
|
MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) +
|
2005-06-18 00:32:51 +02:00
|
|
|
sizeof(GlobalTransaction) * max_prepared_xacts));
|
|
|
|
for (i = 0; i < max_prepared_xacts; i++)
|
|
|
|
{
|
2012-08-08 17:52:02 +02:00
|
|
|
/* insert into linked list */
|
2011-11-25 14:02:10 +01:00
|
|
|
gxacts[i].next = TwoPhaseState->freeGXacts;
|
2008-11-02 22:24:52 +01:00
|
|
|
TwoPhaseState->freeGXacts = &gxacts[i];
|
2009-11-23 10:58:36 +01:00
|
|
|
|
2012-08-08 17:52:02 +02:00
|
|
|
/* associate it with a PGPROC assigned by InitProcGlobal */
|
|
|
|
gxacts[i].pgprocno = PreparedXactProcs[i].pgprocno;
|
|
|
|
|
2009-11-23 10:58:36 +01:00
|
|
|
/*
|
|
|
|
* Assign a unique ID for each dummy proc, so that the range of
|
|
|
|
* dummy backend IDs immediately follows the range of normal
|
2010-02-26 03:01:40 +01:00
|
|
|
* backend IDs. We don't dare to assign a real backend ID to dummy
|
|
|
|
* procs, because prepared transactions don't take part in cache
|
|
|
|
* invalidation like a real backend ID would imply, but having a
|
|
|
|
* unique ID for them is nevertheless handy. This arrangement
|
|
|
|
* allows you to allocate an array of size (MaxBackends +
|
|
|
|
* max_prepared_xacts + 1), and have a slot for every backend and
|
|
|
|
* prepared transaction. Currently multixact.c uses that
|
|
|
|
* technique.
|
2009-11-23 10:58:36 +01:00
|
|
|
*/
|
|
|
|
gxacts[i].dummyBackendId = MaxBackends + 1 + i;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
Assert(found);
|
|
|
|
}
|
|
|
|
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
/*
|
|
|
|
* Exit hook to unlock the global transaction entry we're working on.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
AtProcExit_Twophase(int code, Datum arg)
|
|
|
|
{
|
|
|
|
/* same logic as abort */
|
|
|
|
AtAbort_Twophase();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Abort hook to unlock the global transaction entry we're working on.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
AtAbort_Twophase(void)
|
|
|
|
{
|
|
|
|
if (MyLockedGxact == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
2015-05-24 03:35:49 +02:00
|
|
|
* What to do with the locked global transaction entry? If we were in the
|
|
|
|
* process of preparing the transaction, but haven't written the WAL
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
* record and state file yet, the transaction must not be considered as
|
|
|
|
* prepared. Likewise, if we are in the process of finishing an
|
2015-05-24 03:35:49 +02:00
|
|
|
* already-prepared transaction, and fail after having already written the
|
|
|
|
* 2nd phase commit or rollback record to the WAL, the transaction should
|
|
|
|
* not be considered as prepared anymore. In those cases, just remove the
|
|
|
|
* entry from shared memory.
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
*
|
2015-05-24 03:35:49 +02:00
|
|
|
* Otherwise, the entry must be left in place so that the transaction can
|
|
|
|
* be finished later, so just unlock it.
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
*
|
|
|
|
* If we abort during prepare, after having written the WAL record, we
|
2015-05-20 18:44:46 +02:00
|
|
|
* might not have transferred all locks and other state to the prepared
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
* transaction yet. Likewise, if we abort during commit or rollback,
|
2015-05-24 03:35:49 +02:00
|
|
|
* after having written the WAL record, we might not have released all the
|
|
|
|
* resources held by the transaction yet. In those cases, the in-memory
|
|
|
|
* state can be wrong, but it's too late to back out.
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
*/
|
|
|
|
if (!MyLockedGxact->valid)
|
|
|
|
{
|
|
|
|
RemoveGXact(MyLockedGxact);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
|
|
|
|
|
|
|
|
MyLockedGxact->locking_backend = InvalidBackendId;
|
|
|
|
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
}
|
|
|
|
MyLockedGxact = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-04-26 18:42:31 +02:00
|
|
|
* This is called after we have finished transferring state to the prepared
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
* PGXACT entry.
|
|
|
|
*/
|
|
|
|
void
|
2015-08-15 17:25:00 +02:00
|
|
|
PostPrepare_Twophase(void)
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
{
|
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
|
|
|
|
MyLockedGxact->locking_backend = InvalidBackendId;
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
MyLockedGxact = NULL;
|
|
|
|
}
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* MarkAsPreparing
|
2005-10-15 04:49:52 +02:00
|
|
|
* Reserve the GID for the given transaction.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
GlobalTransaction
|
2005-06-18 21:33:42 +02:00
|
|
|
MarkAsPreparing(TransactionId xid, const char *gid,
|
2005-06-28 07:09:14 +02:00
|
|
|
TimestampTz prepared_at, Oid owner, Oid databaseid)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
GlobalTransaction gxact;
|
|
|
|
int i;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
if (strlen(gid) >= GIDSIZE)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
2005-10-14 00:55:55 +02:00
|
|
|
errmsg("transaction identifier \"%s\" is too long",
|
2005-06-18 00:32:51 +02:00
|
|
|
gid)));
|
|
|
|
|
2009-04-23 02:23:46 +02:00
|
|
|
/* fail immediately if feature is disabled */
|
|
|
|
if (max_prepared_xacts == 0)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
|
|
errmsg("prepared transactions are disabled"),
|
2009-06-11 16:49:15 +02:00
|
|
|
errhint("Set max_prepared_transactions to a nonzero value.")));
|
2009-04-23 02:23:46 +02:00
|
|
|
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
/* on first call, register the exit hook */
|
|
|
|
if (!twophaseExitRegistered)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
before_shmem_exit(AtProcExit_Twophase, 0);
|
|
|
|
twophaseExitRegistered = true;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/* Check for conflicting GID */
|
|
|
|
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
|
|
|
|
{
|
|
|
|
gxact = TwoPhaseState->prepXacts[i];
|
|
|
|
if (strcmp(gxact->gid, gid) == 0)
|
|
|
|
{
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_DUPLICATE_OBJECT),
|
2005-10-14 00:55:55 +02:00
|
|
|
errmsg("transaction identifier \"%s\" is already in use",
|
2005-06-18 00:32:51 +02:00
|
|
|
gid)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Get a free gxact from the freelist */
|
2008-11-02 22:24:52 +01:00
|
|
|
if (TwoPhaseState->freeGXacts == NULL)
|
2005-06-18 00:32:51 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OUT_OF_MEMORY),
|
|
|
|
errmsg("maximum number of prepared transactions reached"),
|
|
|
|
errhint("Increase max_prepared_transactions (currently %d).",
|
|
|
|
max_prepared_xacts)));
|
2008-11-02 22:24:52 +01:00
|
|
|
gxact = TwoPhaseState->freeGXacts;
|
2012-08-08 17:52:02 +02:00
|
|
|
TwoPhaseState->freeGXacts = gxact->next;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid);
|
|
|
|
|
|
|
|
gxact->ondisk = false;
|
|
|
|
|
|
|
|
/* And insert it into the active array */
|
|
|
|
Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts);
|
|
|
|
TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact;
|
|
|
|
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
return gxact;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MarkAsPreparingGuts
|
|
|
|
*
|
|
|
|
* This uses a gxact struct and puts it into the active array.
|
|
|
|
* NOTE: this is also used when reloading a gxact after a crash; so avoid
|
|
|
|
* assuming that we can use very much backend context.
|
|
|
|
*
|
|
|
|
* Note: This function should be called with appropriate locks held.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
|
|
|
|
TimestampTz prepared_at, Oid owner, Oid databaseid)
|
|
|
|
{
|
|
|
|
PGPROC *proc;
|
|
|
|
PGXACT *pgxact;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
Assert(gxact != NULL);
|
2011-11-25 14:02:10 +01:00
|
|
|
proc = &ProcGlobal->allProcs[gxact->pgprocno];
|
|
|
|
pgxact = &ProcGlobal->allPgXact[gxact->pgprocno];
|
|
|
|
|
|
|
|
/* Initialize the PGPROC entry */
|
|
|
|
MemSet(proc, 0, sizeof(PGPROC));
|
|
|
|
proc->pgprocno = gxact->pgprocno;
|
|
|
|
SHMQueueElemInit(&(proc->links));
|
|
|
|
proc->waitStatus = STATUS_OK;
|
2007-09-05 22:53:17 +02:00
|
|
|
/* We set up the gxact's VXID as InvalidBackendId/XID */
|
2011-11-25 14:02:10 +01:00
|
|
|
proc->lxid = (LocalTransactionId) xid;
|
|
|
|
pgxact->xid = xid;
|
|
|
|
pgxact->xmin = InvalidTransactionId;
|
2012-12-03 14:13:53 +01:00
|
|
|
pgxact->delayChkpt = false;
|
2011-11-25 14:02:10 +01:00
|
|
|
pgxact->vacuumFlags = 0;
|
|
|
|
proc->pid = 0;
|
|
|
|
proc->backendId = InvalidBackendId;
|
|
|
|
proc->databaseId = databaseid;
|
|
|
|
proc->roleId = owner;
|
2017-02-01 23:52:35 +01:00
|
|
|
proc->isBackgroundWorker = false;
|
2011-11-25 14:02:10 +01:00
|
|
|
proc->lwWaiting = false;
|
Make group commit more effective.
When a backend needs to flush the WAL, and someone else is already flushing
the WAL, wait until it releases the WALInsertLock and check if we still need
to do the flush or if the other backend already did the work for us, before
acquiring WALInsertLock. This helps group commit, because when the WAL flush
finishes, all the backends that were waiting for it can be woken up in one
go, and the can all concurrently observe that they're done, rather than
waking them up one by one in a cascading fashion.
This is based on a new LWLock function, LWLockWaitUntilFree(), which has
peculiar semantics. If the lock is immediately free, it grabs the lock and
returns true. If it's not free, it waits until it is released, but then
returns false without grabbing the lock. This is used in XLogFlush(), so
that when the lock is acquired, the backend flushes the WAL, but if it's
not, the backend first checks the current flush location before retrying.
Original patch and benchmarking by Peter Geoghegan and Simon Riggs, although
this patch as committed ended up being very different from that.
2012-01-30 15:40:58 +01:00
|
|
|
proc->lwWaitMode = 0;
|
2011-11-25 14:02:10 +01:00
|
|
|
proc->waitLock = NULL;
|
|
|
|
proc->waitProcLock = NULL;
|
2005-12-11 22:02:18 +01:00
|
|
|
for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
|
2011-11-25 14:02:10 +01:00
|
|
|
SHMQueueInit(&(proc->myProcLocks[i]));
|
2005-06-18 00:32:51 +02:00
|
|
|
/* subxid data must be filled later by GXactLoadSubxactData */
|
2011-11-25 14:02:10 +01:00
|
|
|
pgxact->overflowed = false;
|
|
|
|
pgxact->nxids = 0;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2005-06-18 21:33:42 +02:00
|
|
|
gxact->prepared_at = prepared_at;
|
2017-04-04 21:56:56 +02:00
|
|
|
gxact->xid = xid;
|
2005-06-18 00:32:51 +02:00
|
|
|
gxact->owner = owner;
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
gxact->locking_backend = MyBackendId;
|
2005-06-18 00:32:51 +02:00
|
|
|
gxact->valid = false;
|
2017-04-04 21:56:56 +02:00
|
|
|
gxact->inredo = false;
|
2005-06-18 00:32:51 +02:00
|
|
|
strcpy(gxact->gid, gid);
|
|
|
|
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
/*
|
2015-05-24 03:35:49 +02:00
|
|
|
* Remember that we have this GlobalTransaction entry locked for us. If we
|
|
|
|
* abort after this, we must release it.
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
*/
|
|
|
|
MyLockedGxact = gxact;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* GXactLoadSubxactData
|
|
|
|
*
|
|
|
|
* If the transaction being persisted had any subtransactions, this must
|
|
|
|
* be called before MarkAsPrepared() to load information into the dummy
|
|
|
|
* PGPROC.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts,
|
|
|
|
TransactionId *children)
|
|
|
|
{
|
2012-06-10 21:20:04 +02:00
|
|
|
PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno];
|
|
|
|
PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno];
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/* We need no extra lock since the GXACT isn't valid yet */
|
|
|
|
if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS)
|
|
|
|
{
|
2011-11-25 14:02:10 +01:00
|
|
|
pgxact->overflowed = true;
|
2005-06-18 00:32:51 +02:00
|
|
|
nsubxacts = PGPROC_MAX_CACHED_SUBXIDS;
|
|
|
|
}
|
|
|
|
if (nsubxacts > 0)
|
|
|
|
{
|
2011-11-25 14:02:10 +01:00
|
|
|
memcpy(proc->subxids.xids, children,
|
2005-06-18 00:32:51 +02:00
|
|
|
nsubxacts * sizeof(TransactionId));
|
2011-11-25 14:02:10 +01:00
|
|
|
pgxact->nxids = nsubxacts;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MarkAsPrepared
|
|
|
|
* Mark the GXACT as fully valid, and enter it into the global ProcArray.
|
|
|
|
*/
|
2005-06-19 22:00:39 +02:00
|
|
|
static void
|
2005-06-18 00:32:51 +02:00
|
|
|
MarkAsPrepared(GlobalTransaction gxact)
|
|
|
|
{
|
|
|
|
/* Lock here may be overkill, but I'm not convinced of that ... */
|
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
|
|
|
|
Assert(!gxact->valid);
|
|
|
|
gxact->valid = true;
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
/*
|
2007-09-21 18:32:19 +02:00
|
|
|
* Put it into the global ProcArray so TransactionIdIsInProgress considers
|
2005-06-18 00:32:51 +02:00
|
|
|
* the XID as still running.
|
|
|
|
*/
|
2011-11-25 14:02:10 +01:00
|
|
|
ProcArrayAdd(&ProcGlobal->allProcs[gxact->pgprocno]);
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* LockGXact
|
|
|
|
* Locate the prepared transaction and mark it busy for COMMIT or PREPARE.
|
|
|
|
*/
|
|
|
|
static GlobalTransaction
|
2005-06-28 07:09:14 +02:00
|
|
|
LockGXact(const char *gid, Oid user)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
int i;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
/* on first call, register the exit hook */
|
|
|
|
if (!twophaseExitRegistered)
|
|
|
|
{
|
|
|
|
before_shmem_exit(AtProcExit_Twophase, 0);
|
|
|
|
twophaseExitRegistered = true;
|
|
|
|
}
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
|
|
|
|
|
|
|
|
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
|
2012-06-10 21:20:04 +02:00
|
|
|
PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno];
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/* Ignore not-yet-valid GIDs */
|
|
|
|
if (!gxact->valid)
|
|
|
|
continue;
|
|
|
|
if (strcmp(gxact->gid, gid) != 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Found it, but has someone else got it locked? */
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
if (gxact->locking_backend != InvalidBackendId)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
2015-05-24 03:35:49 +02:00
|
|
|
errmsg("prepared transaction with identifier \"%s\" is busy",
|
|
|
|
gid)));
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
if (user != gxact->owner && !superuser_arg(user))
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
2005-10-15 04:49:52 +02:00
|
|
|
errmsg("permission denied to finish prepared transaction"),
|
2005-06-18 00:32:51 +02:00
|
|
|
errhint("Must be superuser or the user that prepared the transaction.")));
|
|
|
|
|
2007-02-13 20:39:42 +01:00
|
|
|
/*
|
2007-11-15 22:14:46 +01:00
|
|
|
* Note: it probably would be possible to allow committing from
|
|
|
|
* another database; but at the moment NOTIFY is known not to work and
|
2014-05-06 18:12:18 +02:00
|
|
|
* there may be some other issues as well. Hence disallow until
|
2007-11-15 22:14:46 +01:00
|
|
|
* someone gets motivated to make it work.
|
2007-02-13 20:39:42 +01:00
|
|
|
*/
|
2011-11-25 14:02:10 +01:00
|
|
|
if (MyDatabaseId != proc->databaseId)
|
2007-02-13 20:39:42 +01:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
2007-11-15 22:14:46 +01:00
|
|
|
errmsg("prepared transaction belongs to another database"),
|
2007-02-13 20:39:42 +01:00
|
|
|
errhint("Connect to the database where the transaction was prepared to finish it.")));
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/* OK for me to lock it */
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
gxact->locking_backend = MyBackendId;
|
|
|
|
MyLockedGxact = gxact;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
return gxact;
|
|
|
|
}
|
|
|
|
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_UNDEFINED_OBJECT),
|
2005-10-15 04:49:52 +02:00
|
|
|
errmsg("prepared transaction with identifier \"%s\" does not exist",
|
|
|
|
gid)));
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/* NOTREACHED */
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RemoveGXact
|
|
|
|
* Remove the prepared transaction from the shared memory array.
|
|
|
|
*
|
|
|
|
* NB: caller should have already removed it from ProcArray
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
RemoveGXact(GlobalTransaction gxact)
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
int i;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
|
|
|
|
|
|
|
|
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
|
|
|
|
{
|
|
|
|
if (gxact == TwoPhaseState->prepXacts[i])
|
|
|
|
{
|
|
|
|
/* remove from the active array */
|
|
|
|
TwoPhaseState->numPrepXacts--;
|
|
|
|
TwoPhaseState->prepXacts[i] = TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts];
|
|
|
|
|
|
|
|
/* and put it back in the freelist */
|
2011-11-25 14:02:10 +01:00
|
|
|
gxact->next = TwoPhaseState->freeGXacts;
|
2008-11-02 22:24:52 +01:00
|
|
|
TwoPhaseState->freeGXacts = gxact;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
elog(ERROR, "failed to find %p in GlobalTransaction array", gxact);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns an array of all prepared transactions for the user-level
|
|
|
|
* function pg_prepared_xact.
|
|
|
|
*
|
|
|
|
* The returned array and all its elements are copies of internal data
|
|
|
|
* structures, to minimize the time we need to hold the TwoPhaseStateLock.
|
|
|
|
*
|
|
|
|
* WARNING -- we return even those transactions that are not fully prepared
|
|
|
|
* yet. The caller should filter them out if he doesn't want them.
|
|
|
|
*
|
|
|
|
* The returned array is palloc'd.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
GetPreparedTransactionList(GlobalTransaction *gxacts)
|
|
|
|
{
|
|
|
|
GlobalTransaction array;
|
2005-10-15 04:49:52 +02:00
|
|
|
int num;
|
|
|
|
int i;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
|
|
|
|
|
|
|
|
if (TwoPhaseState->numPrepXacts == 0)
|
|
|
|
{
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
*gxacts = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
num = TwoPhaseState->numPrepXacts;
|
|
|
|
array = (GlobalTransaction) palloc(sizeof(GlobalTransactionData) * num);
|
|
|
|
*gxacts = array;
|
|
|
|
for (i = 0; i < num; i++)
|
|
|
|
memcpy(array + i, TwoPhaseState->prepXacts[i],
|
|
|
|
sizeof(GlobalTransactionData));
|
|
|
|
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
return num;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Working status for pg_prepared_xact */
|
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
GlobalTransaction array;
|
2005-10-15 04:49:52 +02:00
|
|
|
int ngxacts;
|
|
|
|
int currIdx;
|
2005-06-18 00:32:51 +02:00
|
|
|
} Working_State;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pg_prepared_xact
|
2005-10-15 04:49:52 +02:00
|
|
|
* Produce a view with one row per prepared transaction.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
|
|
|
* This function is here so we don't have to export the
|
|
|
|
* GlobalTransactionData struct definition.
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
pg_prepared_xact(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
FuncCallContext *funcctx;
|
|
|
|
Working_State *status;
|
|
|
|
|
|
|
|
if (SRF_IS_FIRSTCALL())
|
|
|
|
{
|
|
|
|
TupleDesc tupdesc;
|
|
|
|
MemoryContext oldcontext;
|
|
|
|
|
|
|
|
/* create a function context for cross-call persistence */
|
|
|
|
funcctx = SRF_FIRSTCALL_INIT();
|
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Switch to memory context appropriate for multiple function calls
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
|
|
|
|
|
|
|
/* build tupdesc for result tuples */
|
|
|
|
/* this had better match pg_prepared_xacts view in system_views.sql */
|
2005-06-18 21:33:42 +02:00
|
|
|
tupdesc = CreateTemplateTupleDesc(5, false);
|
2005-06-18 00:32:51 +02:00
|
|
|
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "transaction",
|
|
|
|
XIDOID, -1, 0);
|
|
|
|
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "gid",
|
|
|
|
TEXTOID, -1, 0);
|
2005-06-18 21:33:42 +02:00
|
|
|
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepared",
|
|
|
|
TIMESTAMPTZOID, -1, 0);
|
|
|
|
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "ownerid",
|
2005-06-28 07:09:14 +02:00
|
|
|
OIDOID, -1, 0);
|
2005-06-18 21:33:42 +02:00
|
|
|
TupleDescInitEntry(tupdesc, (AttrNumber) 5, "dbid",
|
2005-06-18 00:32:51 +02:00
|
|
|
OIDOID, -1, 0);
|
|
|
|
|
|
|
|
funcctx->tuple_desc = BlessTupleDesc(tupdesc);
|
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Collect all the 2PC status information that we will format and send
|
|
|
|
* out as a result set.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
status = (Working_State *) palloc(sizeof(Working_State));
|
|
|
|
funcctx->user_fctx = (void *) status;
|
|
|
|
|
|
|
|
status->ngxacts = GetPreparedTransactionList(&status->array);
|
|
|
|
status->currIdx = 0;
|
|
|
|
|
|
|
|
MemoryContextSwitchTo(oldcontext);
|
|
|
|
}
|
|
|
|
|
|
|
|
funcctx = SRF_PERCALL_SETUP();
|
|
|
|
status = (Working_State *) funcctx->user_fctx;
|
|
|
|
|
|
|
|
while (status->array != NULL && status->currIdx < status->ngxacts)
|
|
|
|
{
|
|
|
|
GlobalTransaction gxact = &status->array[status->currIdx++];
|
2012-06-10 21:20:04 +02:00
|
|
|
PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno];
|
|
|
|
PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno];
|
2005-06-18 21:33:42 +02:00
|
|
|
Datum values[5];
|
|
|
|
bool nulls[5];
|
2005-06-18 00:32:51 +02:00
|
|
|
HeapTuple tuple;
|
|
|
|
Datum result;
|
|
|
|
|
|
|
|
if (!gxact->valid)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Form tuple with appropriate data.
|
|
|
|
*/
|
|
|
|
MemSet(values, 0, sizeof(values));
|
|
|
|
MemSet(nulls, 0, sizeof(nulls));
|
|
|
|
|
2011-11-25 14:02:10 +01:00
|
|
|
values[0] = TransactionIdGetDatum(pgxact->xid);
|
2008-03-25 23:42:46 +01:00
|
|
|
values[1] = CStringGetTextDatum(gxact->gid);
|
2005-06-18 21:33:42 +02:00
|
|
|
values[2] = TimestampTzGetDatum(gxact->prepared_at);
|
2005-06-28 07:09:14 +02:00
|
|
|
values[3] = ObjectIdGetDatum(gxact->owner);
|
2011-11-25 14:02:10 +01:00
|
|
|
values[4] = ObjectIdGetDatum(proc->databaseId);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
|
|
|
|
result = HeapTupleGetDatum(tuple);
|
|
|
|
SRF_RETURN_NEXT(funcctx, result);
|
|
|
|
}
|
|
|
|
|
|
|
|
SRF_RETURN_DONE(funcctx);
|
|
|
|
}
|
|
|
|
|
2009-11-23 10:58:36 +01:00
|
|
|
/*
|
2012-08-08 17:52:02 +02:00
|
|
|
* TwoPhaseGetGXact
|
|
|
|
* Get the GlobalTransaction struct for a prepared transaction
|
|
|
|
* specified by XID
|
2009-11-23 10:58:36 +01:00
|
|
|
*/
|
2012-08-08 17:52:02 +02:00
|
|
|
static GlobalTransaction
|
|
|
|
TwoPhaseGetGXact(TransactionId xid)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
2012-08-08 17:52:02 +02:00
|
|
|
GlobalTransaction result = NULL;
|
2005-06-18 00:32:51 +02:00
|
|
|
int i;
|
|
|
|
|
|
|
|
static TransactionId cached_xid = InvalidTransactionId;
|
2012-08-08 17:52:02 +02:00
|
|
|
static GlobalTransaction cached_gxact = NULL;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called
|
|
|
|
* repeatedly for the same XID. We can save work with a simple cache.
|
|
|
|
*/
|
|
|
|
if (xid == cached_xid)
|
2012-08-08 17:52:02 +02:00
|
|
|
return cached_gxact;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
|
|
|
|
|
|
|
|
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
|
2012-06-10 21:20:04 +02:00
|
|
|
PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno];
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2011-11-25 14:02:10 +01:00
|
|
|
if (pgxact->xid == xid)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
2012-08-08 17:52:02 +02:00
|
|
|
result = gxact;
|
2005-06-18 00:32:51 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
if (result == NULL) /* should not happen */
|
2012-08-08 17:52:02 +02:00
|
|
|
elog(ERROR, "failed to find GlobalTransaction for xid %u", xid);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
cached_xid = xid;
|
2012-08-08 17:52:02 +02:00
|
|
|
cached_gxact = result;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2012-08-08 17:52:02 +02:00
|
|
|
/*
|
|
|
|
* TwoPhaseGetDummyProc
|
|
|
|
* Get the dummy backend ID for prepared transaction specified by XID
|
|
|
|
*
|
|
|
|
* Dummy backend IDs are similar to real backend IDs of real backends.
|
|
|
|
* They start at MaxBackends + 1, and are unique across all currently active
|
|
|
|
* real backends and prepared transactions.
|
|
|
|
*/
|
|
|
|
BackendId
|
|
|
|
TwoPhaseGetDummyBackendId(TransactionId xid)
|
|
|
|
{
|
|
|
|
GlobalTransaction gxact = TwoPhaseGetGXact(xid);
|
|
|
|
|
|
|
|
return gxact->dummyBackendId;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TwoPhaseGetDummyProc
|
|
|
|
* Get the PGPROC that represents a prepared transaction specified by XID
|
|
|
|
*/
|
|
|
|
PGPROC *
|
|
|
|
TwoPhaseGetDummyProc(TransactionId xid)
|
|
|
|
{
|
|
|
|
GlobalTransaction gxact = TwoPhaseGetGXact(xid);
|
|
|
|
|
|
|
|
return &ProcGlobal->allProcs[gxact->pgprocno];
|
|
|
|
}
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/************************************************************************/
|
2005-10-15 04:49:52 +02:00
|
|
|
/* State file support */
|
2005-06-18 00:32:51 +02:00
|
|
|
/************************************************************************/
|
|
|
|
|
|
|
|
/*
 * TwoPhaseFilePath
 *		Format the state file name for transaction "xid" into "path":
 *		TWOPHASE_DIR, a slash, then the XID as upper-case hexadecimal,
 *		zero-padded to at least eight digits.
 *
 * At most MAXPGPATH bytes are written to "path".  The macro expands to the
 * snprintf call, so its value is snprintf's result.  Parameters are
 * parenthesized in the expansion so any argument expression is passed
 * through intact.
 */
#define TwoPhaseFilePath(path, xid) \
	snprintf((path), MAXPGPATH, TWOPHASE_DIR "/%08X", (xid))
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* 2PC state file format:
|
|
|
|
*
|
2005-10-15 04:49:52 +02:00
|
|
|
* 1. TwoPhaseFileHeader
|
|
|
|
* 2. TransactionId[] (subtransactions)
|
2008-11-19 11:34:52 +01:00
|
|
|
* 3. RelFileNode[] (files to be deleted at commit)
|
|
|
|
* 4. RelFileNode[] (files to be deleted at abort)
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
* 5. SharedInvalidationMessage[] (inval messages to be sent at commit)
|
|
|
|
* 6. TwoPhaseRecordOnDisk
|
|
|
|
* 7. ...
|
|
|
|
* 8. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID)
|
Switch to CRC-32C in WAL and other places.
The old algorithm was found to not be the usual CRC-32 algorithm, used by
Ethernet et al. We were using a non-reflected lookup table with code meant
for a reflected lookup table. That's a strange combination that AFAICS does
not correspond to any bit-wise CRC calculation, which makes it difficult to
reason about its properties. Although it has worked well in practice, seems
safer to use a well-known algorithm.
Since we're changing the algorithm anyway, we might as well choose a
different polynomial. The Castagnoli polynomial has better error-correcting
properties than the traditional CRC-32 polynomial, even if we had
implemented it correctly. Another reason for picking that is that some new
CPUs have hardware support for calculating CRC-32C, but not CRC-32, let
alone our strange variant of it. This patch doesn't add any support for such
hardware, but a future patch could now do that.
The old algorithm is kept around for tsquery and pg_trgm, which use the
values in indexes that need to remain compatible so that pg_upgrade works.
While we're at it, share the old lookup table for CRC-32 calculation
between hstore, ltree and core. They all use the same table, so might as
well.
2014-11-04 10:35:15 +01:00
|
|
|
* 9. checksum (CRC-32C)
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
Switch to CRC-32C in WAL and other places.
The old algorithm was found to not be the usual CRC-32 algorithm, used by
Ethernet et al. We were using a non-reflected lookup table with code meant
for a reflected lookup table. That's a strange combination that AFAICS does
not correspond to any bit-wise CRC calculation, which makes it difficult to
reason about its properties. Although it has worked well in practice, seems
safer to use a well-known algorithm.
Since we're changing the algorithm anyway, we might as well choose a
different polynomial. The Castagnoli polynomial has better error-correcting
properties than the traditional CRC-32 polynomial, even if we had
implemented it correctly. Another reason for picking that is that some new
CPUs have hardware support for calculating CRC-32C, but not CRC-32, let
alone our strange variant of it. This patch doesn't add any support for such
hardware, but a future patch could now do that.
The old algorithm is kept around for tsquery and pg_trgm, which use the
values in indexes that need to remain compatible so that pg_upgrade works.
While we're at it, share the old lookup table for CRC-32 calculation
between hstore, ltree and core. They all use the same table, so might as
well.
2014-11-04 10:35:15 +01:00
|
|
|
* Each segment except the final checksum is MAXALIGN'd.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Header for a 2PC state file
|
|
|
|
*/
|
2016-03-10 13:51:46 +01:00
|
|
|
#define TWOPHASE_MAGIC 0x57F94533 /* format identifier */
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
typedef struct TwoPhaseFileHeader
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
uint32 magic; /* format identifier */
|
|
|
|
uint32 total_len; /* actual file length */
|
|
|
|
TransactionId xid; /* original transaction XID */
|
|
|
|
Oid database; /* OID of database it was in */
|
|
|
|
TimestampTz prepared_at; /* time of preparation */
|
|
|
|
Oid owner; /* user running the transaction */
|
|
|
|
int32 nsubxacts; /* number of following subxact XIDs */
|
|
|
|
int32 ncommitrels; /* number of delete-on-commit rels */
|
|
|
|
int32 nabortrels; /* number of delete-on-abort rels */
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
int32 ninvalmsgs; /* number of cache invalidation messages */
|
|
|
|
bool initfileinval; /* does relcache init file need invalidation? */
|
2016-03-10 13:51:46 +01:00
|
|
|
uint16 gidlen; /* length of the GID - GID follows the header */
|
2005-06-18 00:32:51 +02:00
|
|
|
} TwoPhaseFileHeader;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Header for each record in a state file
|
|
|
|
*
|
|
|
|
* NOTE: len counts only the rmgr data, not the TwoPhaseRecordOnDisk header.
|
|
|
|
* The rmgr data will be stored starting on a MAXALIGN boundary.
|
|
|
|
*/
|
|
|
|
typedef struct TwoPhaseRecordOnDisk
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
uint32 len; /* length of rmgr data */
|
|
|
|
TwoPhaseRmgrId rmid; /* resource manager for this record */
|
|
|
|
uint16 info; /* flag bits for use by rmgr */
|
2005-06-18 00:32:51 +02:00
|
|
|
} TwoPhaseRecordOnDisk;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* During prepare, the state file is assembled in memory before writing it
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
* to WAL and the actual state file. We use a chain of StateFileChunk blocks
|
|
|
|
* for that.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
typedef struct StateFileChunk
|
|
|
|
{
|
|
|
|
char *data;
|
|
|
|
uint32 len;
|
|
|
|
struct StateFileChunk *next;
|
|
|
|
} StateFileChunk;
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
static struct xllist
|
|
|
|
{
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
StateFileChunk *head; /* first data block in the chain */
|
|
|
|
StateFileChunk *tail; /* last block in chain */
|
|
|
|
uint32 num_chunks;
|
2005-10-15 04:49:52 +02:00
|
|
|
uint32 bytes_free; /* free bytes left in tail block */
|
|
|
|
uint32 total_len; /* total data bytes in chain */
|
|
|
|
} records;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append a block of data to records data structure.
|
|
|
|
*
|
|
|
|
* NB: each block is padded to a MAXALIGN multiple. This must be
|
|
|
|
* accounted for when the file is later read!
|
|
|
|
*
|
|
|
|
* The data is copied, so the caller is free to modify it afterwards.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
save_state_data(const void *data, uint32 len)
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
uint32 padlen = MAXALIGN(len);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
if (padlen > records.bytes_free)
|
|
|
|
{
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
records.tail->next = palloc0(sizeof(StateFileChunk));
|
2005-06-18 00:32:51 +02:00
|
|
|
records.tail = records.tail->next;
|
|
|
|
records.tail->len = 0;
|
|
|
|
records.tail->next = NULL;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
records.num_chunks++;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
records.bytes_free = Max(padlen, 512);
|
|
|
|
records.tail->data = palloc(records.bytes_free);
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy(((char *) records.tail->data) + records.tail->len, data, len);
|
|
|
|
records.tail->len += padlen;
|
|
|
|
records.bytes_free -= padlen;
|
|
|
|
records.total_len += padlen;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Start preparing a state file.
|
|
|
|
*
|
|
|
|
* Initializes data structure and inserts the 2PC file header record.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
StartPrepare(GlobalTransaction gxact)
|
|
|
|
{
|
2012-06-10 21:20:04 +02:00
|
|
|
PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno];
|
|
|
|
PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno];
|
2011-11-25 14:02:10 +01:00
|
|
|
TransactionId xid = pgxact->xid;
|
2005-06-18 00:32:51 +02:00
|
|
|
TwoPhaseFileHeader hdr;
|
|
|
|
TransactionId *children;
|
2008-11-19 11:34:52 +01:00
|
|
|
RelFileNode *commitrels;
|
|
|
|
RelFileNode *abortrels;
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
SharedInvalidationMessage *invalmsgs;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/* Initialize linked list */
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
records.head = palloc0(sizeof(StateFileChunk));
|
2005-06-18 00:32:51 +02:00
|
|
|
records.head->len = 0;
|
|
|
|
records.head->next = NULL;
|
|
|
|
|
|
|
|
records.bytes_free = Max(sizeof(TwoPhaseFileHeader), 512);
|
|
|
|
records.head->data = palloc(records.bytes_free);
|
|
|
|
|
|
|
|
records.tail = records.head;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
records.num_chunks = 1;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
records.total_len = 0;
|
|
|
|
|
|
|
|
/* Create header */
|
|
|
|
hdr.magic = TWOPHASE_MAGIC;
|
|
|
|
hdr.total_len = 0; /* EndPrepare will fill this in */
|
|
|
|
hdr.xid = xid;
|
2011-11-25 14:02:10 +01:00
|
|
|
hdr.database = proc->databaseId;
|
2005-06-18 21:33:42 +02:00
|
|
|
hdr.prepared_at = gxact->prepared_at;
|
|
|
|
hdr.owner = gxact->owner;
|
2005-06-18 00:32:51 +02:00
|
|
|
hdr.nsubxacts = xactGetCommittedChildren(&children);
|
2010-08-13 22:10:54 +02:00
|
|
|
hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels);
|
|
|
|
hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels);
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
hdr.ninvalmsgs = xactGetCommittedInvalidationMessages(&invalmsgs,
|
|
|
|
&hdr.initfileinval);
|
2016-06-10 00:02:36 +02:00
|
|
|
hdr.gidlen = strlen(gxact->gid) + 1; /* Include '\0' */
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
save_state_data(&hdr, sizeof(TwoPhaseFileHeader));
|
2016-03-10 13:51:46 +01:00
|
|
|
save_state_data(gxact->gid, hdr.gidlen);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
/*
|
2010-02-26 03:01:40 +01:00
|
|
|
* Add the additional info about subxacts, deletable files and cache
|
|
|
|
* invalidation messages.
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
*/
|
2005-06-18 00:32:51 +02:00
|
|
|
if (hdr.nsubxacts > 0)
|
|
|
|
{
|
|
|
|
save_state_data(children, hdr.nsubxacts * sizeof(TransactionId));
|
|
|
|
/* While we have the child-xact data, stuff it in the gxact too */
|
|
|
|
GXactLoadSubxactData(gxact, hdr.nsubxacts, children);
|
|
|
|
}
|
|
|
|
if (hdr.ncommitrels > 0)
|
|
|
|
{
|
2008-11-19 11:34:52 +01:00
|
|
|
save_state_data(commitrels, hdr.ncommitrels * sizeof(RelFileNode));
|
2005-06-18 00:32:51 +02:00
|
|
|
pfree(commitrels);
|
|
|
|
}
|
|
|
|
if (hdr.nabortrels > 0)
|
|
|
|
{
|
2008-11-19 11:34:52 +01:00
|
|
|
save_state_data(abortrels, hdr.nabortrels * sizeof(RelFileNode));
|
2005-06-18 00:32:51 +02:00
|
|
|
pfree(abortrels);
|
|
|
|
}
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
if (hdr.ninvalmsgs > 0)
|
|
|
|
{
|
|
|
|
save_state_data(invalmsgs,
|
|
|
|
hdr.ninvalmsgs * sizeof(SharedInvalidationMessage));
|
|
|
|
pfree(invalmsgs);
|
|
|
|
}
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2016-01-21 03:40:44 +01:00
|
|
|
* Finish preparing state data and writing it to WAL.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
EndPrepare(GlobalTransaction gxact)
|
|
|
|
{
|
|
|
|
TwoPhaseFileHeader *hdr;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
StateFileChunk *record;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/* Add the end sentinel to the list of 2PC records */
|
|
|
|
RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
|
|
|
|
NULL, 0);
|
|
|
|
|
|
|
|
/* Go back and fill in total_len in the file header record */
|
|
|
|
hdr = (TwoPhaseFileHeader *) records.head->data;
|
|
|
|
Assert(hdr->magic == TWOPHASE_MAGIC);
|
2015-04-14 16:03:42 +02:00
|
|
|
hdr->total_len = records.total_len + sizeof(pg_crc32c);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2008-05-19 20:16:26 +02:00
|
|
|
/*
|
2016-01-21 03:40:44 +01:00
|
|
|
* If the data size exceeds MaxAllocSize, we won't be able to read it in
|
|
|
|
* ReadTwoPhaseFile. Check for that now, rather than fail in the case
|
|
|
|
* where we write data to file and then re-read at commit time.
|
2008-05-19 20:16:26 +02:00
|
|
|
*/
|
|
|
|
if (hdr->total_len > MaxAllocSize)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
|
|
errmsg("two-phase state file maximum length exceeded")));
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/*
|
2016-01-21 03:40:44 +01:00
|
|
|
* Now writing 2PC state data to WAL. We let the WAL's CRC protection
|
|
|
|
* cover us, so no need to calculate a separate CRC.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
2012-12-03 14:13:53 +01:00
|
|
|
* We have to set delayChkpt here, too; otherwise a checkpoint starting
|
2007-11-15 22:14:46 +01:00
|
|
|
* immediately after the WAL record is inserted could complete without
|
|
|
|
* fsync'ing our state file. (This is essentially the same kind of race
|
|
|
|
* condition as the COMMIT-to-clog-write case that RecordTransactionCommit
|
2012-12-03 14:13:53 +01:00
|
|
|
* uses delayChkpt for; see notes there.)
|
2005-06-19 22:00:39 +02:00
|
|
|
*
|
|
|
|
* We save the PREPARE record's location in the gxact for later use by
|
|
|
|
* CheckPointTwoPhase.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogEnsureRecordSpace(0, records.num_chunks);
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
START_CRIT_SECTION();
|
|
|
|
|
2012-12-03 14:13:53 +01:00
|
|
|
MyPgXact->delayChkpt = true;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogBeginInsert();
|
|
|
|
for (record = records.head; record != NULL; record = record->next)
|
|
|
|
XLogRegisterData(record->data, record->len);
|
2016-01-21 03:40:44 +01:00
|
|
|
gxact->prepare_end_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE);
|
|
|
|
XLogFlush(gxact->prepare_end_lsn);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/* If we crash now, we have prepared: WAL replay will fix things */
|
|
|
|
|
2016-01-21 03:40:44 +01:00
|
|
|
/* Store record's start location to read that later on Commit */
|
|
|
|
gxact->prepare_start_lsn = ProcLastRecPtr;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2005-06-19 22:00:39 +02:00
|
|
|
/*
|
2014-05-06 18:12:18 +02:00
|
|
|
* Mark the prepared transaction as valid. As soon as xact.c marks
|
2012-06-10 21:20:04 +02:00
|
|
|
* MyPgXact as not running our XID (which it will do immediately after
|
|
|
|
* this function returns), others can commit/rollback the xact.
|
2005-06-19 22:00:39 +02:00
|
|
|
*
|
|
|
|
* NB: a side effect of this is to make a dummy ProcArray entry for the
|
2012-05-14 09:22:44 +02:00
|
|
|
* prepared XID. This must happen before we clear the XID from MyPgXact,
|
2005-06-19 22:00:39 +02:00
|
|
|
* else there is a window where the XID is not running according to
|
2007-09-21 18:32:19 +02:00
|
|
|
* TransactionIdIsInProgress, and onlookers would be entitled to assume
|
|
|
|
* the xact crashed. Instead we have a window where the same XID appears
|
2005-10-15 04:49:52 +02:00
|
|
|
* twice in ProcArray, which is OK.
|
2005-06-19 22:00:39 +02:00
|
|
|
*/
|
|
|
|
MarkAsPrepared(gxact);
|
|
|
|
|
|
|
|
/*
|
2007-11-15 22:14:46 +01:00
|
|
|
* Now we can mark ourselves as out of the commit critical section: a
|
|
|
|
* checkpoint starting after this will certainly see the gxact as a
|
2007-04-03 18:34:36 +02:00
|
|
|
* candidate for fsyncing.
|
2005-06-19 22:00:39 +02:00
|
|
|
*/
|
2012-12-03 14:13:53 +01:00
|
|
|
MyPgXact->delayChkpt = false;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
/*
|
|
|
|
* Remember that we have this GlobalTransaction entry locked for us. If
|
|
|
|
* we crash after this point, it's too late to abort, but we must unlock
|
|
|
|
* it so that the prepared transaction can be committed or rolled back.
|
|
|
|
*/
|
|
|
|
MyLockedGxact = gxact;
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
END_CRIT_SECTION();
|
|
|
|
|
2011-03-06 23:49:16 +01:00
|
|
|
/*
|
|
|
|
* Wait for synchronous replication, if required.
|
|
|
|
*
|
|
|
|
* Note that at this stage we have marked the prepare, but still show as
|
|
|
|
* running in the procarray (twice!) and continue to hold locks.
|
|
|
|
*/
|
2016-03-30 03:16:12 +02:00
|
|
|
SyncRepWaitForLSN(gxact->prepare_end_lsn, false);
|
2011-03-06 23:49:16 +01:00
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
records.tail = records.head = NULL;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
records.num_chunks = 0;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Register a 2PC record to be written to state file.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
|
|
|
|
const void *data, uint32 len)
|
|
|
|
{
|
|
|
|
TwoPhaseRecordOnDisk record;
|
|
|
|
|
|
|
|
record.rmid = rmid;
|
|
|
|
record.info = info;
|
|
|
|
record.len = len;
|
|
|
|
save_state_data(&record, sizeof(TwoPhaseRecordOnDisk));
|
|
|
|
if (len > 0)
|
|
|
|
save_state_data(data, len);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read and validate the state file for xid.
|
|
|
|
*
|
|
|
|
* If it looks OK (has a valid magic number and CRC), return the palloc'd
|
|
|
|
* contents of the file. Otherwise return NULL.
|
|
|
|
*/
|
|
|
|
static char *
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
|
|
|
char path[MAXPGPATH];
|
|
|
|
char *buf;
|
|
|
|
TwoPhaseFileHeader *hdr;
|
|
|
|
int fd;
|
2005-10-15 04:49:52 +02:00
|
|
|
struct stat stat;
|
2005-06-18 00:32:51 +02:00
|
|
|
uint32 crc_offset;
|
2015-04-14 16:03:42 +02:00
|
|
|
pg_crc32c calc_crc,
|
2005-10-15 04:49:52 +02:00
|
|
|
file_crc;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
TwoPhaseFilePath(path, xid);
|
|
|
|
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
|
2005-06-18 00:32:51 +02:00
|
|
|
if (fd < 0)
|
|
|
|
{
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
if (give_warnings)
|
|
|
|
ereport(WARNING,
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
errmsg("could not open two-phase state file \"%s\": %m",
|
|
|
|
path)));
|
2005-06-18 00:32:51 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Check file length. We can determine a lower bound pretty easily. We
|
2008-05-19 20:16:26 +02:00
|
|
|
* set an upper bound to avoid palloc() failure on a corrupt file, though
|
|
|
|
* we can't guarantee that we won't get an out of memory error anyway,
|
|
|
|
* even on a valid file.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
if (fstat(fd, &stat))
|
|
|
|
{
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
CloseTransientFile(fd);
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
if (give_warnings)
|
|
|
|
ereport(WARNING,
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
errmsg("could not stat two-phase state file \"%s\": %m",
|
|
|
|
path)));
|
2005-06-18 00:32:51 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
|
|
|
|
MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
|
2015-04-14 16:03:42 +02:00
|
|
|
sizeof(pg_crc32c)) ||
|
2008-05-19 20:16:26 +02:00
|
|
|
stat.st_size > MaxAllocSize)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
CloseTransientFile(fd);
|
2005-06-18 00:32:51 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2015-04-14 16:03:42 +02:00
|
|
|
crc_offset = stat.st_size - sizeof(pg_crc32c);
|
2005-06-18 00:32:51 +02:00
|
|
|
if (crc_offset != MAXALIGN(crc_offset))
|
|
|
|
{
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
CloseTransientFile(fd);
|
2005-06-18 00:32:51 +02:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* OK, slurp in the file.
|
|
|
|
*/
|
|
|
|
buf = (char *) palloc(stat.st_size);
|
|
|
|
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_READ);
|
2005-06-18 00:32:51 +02:00
|
|
|
if (read(fd, buf, stat.st_size) != stat.st_size)
|
|
|
|
{
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
pgstat_report_wait_end();
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
CloseTransientFile(fd);
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
if (give_warnings)
|
|
|
|
ereport(WARNING,
|
|
|
|
(errcode_for_file_access(),
|
|
|
|
errmsg("could not read two-phase state file \"%s\": %m",
|
|
|
|
path)));
|
2005-06-18 00:32:51 +02:00
|
|
|
pfree(buf);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
pgstat_report_wait_end();
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
CloseTransientFile(fd);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
hdr = (TwoPhaseFileHeader *) buf;
|
|
|
|
if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != stat.st_size)
|
|
|
|
{
|
|
|
|
pfree(buf);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
Switch to CRC-32C in WAL and other places.
The old algorithm was found to not be the usual CRC-32 algorithm, used by
Ethernet et al. We were using a non-reflected lookup table with code meant
for a reflected lookup table. That's a strange combination that AFAICS does
not correspond to any bit-wise CRC calculation, which makes it difficult to
reason about its properties. Although it has worked well in practice, seems
safer to use a well-known algorithm.
Since we're changing the algorithm anyway, we might as well choose a
different polynomial. The Castagnoli polynomial has better error-correcting
properties than the traditional CRC-32 polynomial, even if we had
implemented it correctly. Another reason for picking that is that some new
CPUs have hardware support for calculating CRC-32C, but not CRC-32, let
alone our strange variant of it. This patch doesn't add any support for such
hardware, but a future patch could now do that.
The old algorithm is kept around for tsquery and pg_trgm, which use the
values in indexes that need to remain compatible so that pg_upgrade works.
While we're at it, share the old lookup table for CRC-32 calculation
between hstore, ltree and core. They all use the same table, so might as
well.
2014-11-04 10:35:15 +01:00
|
|
|
INIT_CRC32C(calc_crc);
|
|
|
|
COMP_CRC32C(calc_crc, buf, crc_offset);
|
|
|
|
FIN_CRC32C(calc_crc);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2015-04-14 16:03:42 +02:00
|
|
|
file_crc = *((pg_crc32c *) (buf + crc_offset));
|
2005-06-18 00:32:51 +02:00
|
|
|
|
Switch to CRC-32C in WAL and other places.
The old algorithm was found to not be the usual CRC-32 algorithm, used by
Ethernet et al. We were using a non-reflected lookup table with code meant
for a reflected lookup table. That's a strange combination that AFAICS does
not correspond to any bit-wise CRC calculation, which makes it difficult to
reason about its properties. Although it has worked well in practice, seems
safer to use a well-known algorithm.
Since we're changing the algorithm anyway, we might as well choose a
different polynomial. The Castagnoli polynomial has better error-correcting
properties than the traditional CRC-32 polynomial, even if we had
implemented it correctly. Another reason for picking that is that some new
CPUs have hardware support for calculating CRC-32C, but not CRC-32, let
alone our strange variant of it. This patch doesn't add any support for such
hardware, but a future patch could now do that.
The old algorithm is kept around for tsquery and pg_trgm, which use the
values in indexes that need to remain compatible so that pg_upgrade works.
While we're at it, share the old lookup table for CRC-32 calculation
between hstore, ltree and core. They all use the same table, so might as
well.
2014-11-04 10:35:15 +01:00
|
|
|
if (!EQ_CRC32C(calc_crc, file_crc))
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
|
|
|
pfree(buf);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
2016-01-21 03:40:44 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Reads 2PC data from xlog. During checkpoint this data will be moved to
|
|
|
|
* twophase files and ReadTwoPhaseFile should be used instead.
|
|
|
|
*
|
2017-04-04 21:56:56 +02:00
|
|
|
* Note clearly that this function can access WAL during normal operation,
|
|
|
|
* similarly to the way WALSender or Logical Decoding would do.
|
|
|
|
*
|
2016-01-21 03:40:44 +01:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
|
|
|
|
{
|
|
|
|
XLogRecord *record;
|
|
|
|
XLogReaderState *xlogreader;
|
|
|
|
char *errormsg;
|
|
|
|
|
|
|
|
xlogreader = XLogReaderAllocate(&read_local_xlog_page, NULL);
|
|
|
|
if (!xlogreader)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OUT_OF_MEMORY),
|
|
|
|
errmsg("out of memory"),
|
2017-03-13 20:40:16 +01:00
|
|
|
errdetail("Failed while allocating a WAL reading processor.")));
|
2016-01-21 03:40:44 +01:00
|
|
|
|
|
|
|
record = XLogReadRecord(xlogreader, lsn, &errormsg);
|
|
|
|
if (record == NULL)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
2017-03-13 20:40:16 +01:00
|
|
|
errmsg("could not read two-phase state from WAL at %X/%X",
|
2016-06-10 00:02:36 +02:00
|
|
|
(uint32) (lsn >> 32),
|
|
|
|
(uint32) lsn)));
|
2016-01-21 03:40:44 +01:00
|
|
|
|
|
|
|
if (XLogRecGetRmid(xlogreader) != RM_XACT_ID ||
|
|
|
|
(XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
2017-03-13 20:40:16 +01:00
|
|
|
errmsg("expected two-phase state data is not present in WAL at %X/%X",
|
2016-06-10 00:02:36 +02:00
|
|
|
(uint32) (lsn >> 32),
|
|
|
|
(uint32) lsn)));
|
2016-01-21 03:40:44 +01:00
|
|
|
|
|
|
|
if (len != NULL)
|
|
|
|
*len = XLogRecGetDataLen(xlogreader);
|
|
|
|
|
2016-06-10 00:02:36 +02:00
|
|
|
*buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader));
|
2016-01-21 03:40:44 +01:00
|
|
|
memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader));
|
|
|
|
|
|
|
|
XLogReaderFree(xlogreader);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
/*
|
|
|
|
* Confirms an xid is prepared, during recovery
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
StandbyTransactionIdIsPrepared(TransactionId xid)
|
|
|
|
{
|
|
|
|
char *buf;
|
|
|
|
TwoPhaseFileHeader *hdr;
|
|
|
|
bool result;
|
|
|
|
|
|
|
|
Assert(TransactionIdIsValid(xid));
|
|
|
|
|
2010-04-28 02:09:05 +02:00
|
|
|
if (max_prepared_xacts <= 0)
|
2010-07-06 21:19:02 +02:00
|
|
|
return false; /* nothing to do */
|
2010-04-28 02:09:05 +02:00
|
|
|
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
/* Read and validate file */
|
|
|
|
buf = ReadTwoPhaseFile(xid, false);
|
|
|
|
if (buf == NULL)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* Check header also */
|
|
|
|
hdr = (TwoPhaseFileHeader *) buf;
|
|
|
|
result = TransactionIdEquals(hdr->xid, xid);
|
|
|
|
pfree(buf);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED
|
|
|
|
*/
|
|
|
|
void
|
2005-06-18 21:33:42 +02:00
|
|
|
FinishPreparedTransaction(const char *gid, bool isCommit)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
|
|
|
GlobalTransaction gxact;
|
2011-11-25 14:02:10 +01:00
|
|
|
PGPROC *proc;
|
|
|
|
PGXACT *pgxact;
|
2005-06-18 00:32:51 +02:00
|
|
|
TransactionId xid;
|
2005-10-15 04:49:52 +02:00
|
|
|
char *buf;
|
|
|
|
char *bufptr;
|
2005-06-18 00:32:51 +02:00
|
|
|
TwoPhaseFileHeader *hdr;
|
2007-09-08 22:31:15 +02:00
|
|
|
TransactionId latestXid;
|
2005-06-18 00:32:51 +02:00
|
|
|
TransactionId *children;
|
2008-11-19 11:34:52 +01:00
|
|
|
RelFileNode *commitrels;
|
|
|
|
RelFileNode *abortrels;
|
|
|
|
RelFileNode *delrels;
|
|
|
|
int ndelrels;
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
SharedInvalidationMessage *invalmsgs;
|
2005-10-15 04:49:52 +02:00
|
|
|
int i;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Validate the GID, and lock the GXACT to ensure that two backends do not
|
|
|
|
* try to commit the same GID at once.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
gxact = LockGXact(gid, GetUserId());
|
2011-11-25 14:02:10 +01:00
|
|
|
proc = &ProcGlobal->allProcs[gxact->pgprocno];
|
|
|
|
pgxact = &ProcGlobal->allPgXact[gxact->pgprocno];
|
|
|
|
xid = pgxact->xid;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
2016-06-10 00:02:36 +02:00
|
|
|
* Read and validate 2PC state data. State data will typically be stored
|
|
|
|
* in WAL files if the LSN is after the last checkpoint record, or moved
|
|
|
|
* to disk if for some reason they have lived for a long time.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
2016-01-21 03:40:44 +01:00
|
|
|
if (gxact->ondisk)
|
|
|
|
buf = ReadTwoPhaseFile(xid, true);
|
|
|
|
else
|
|
|
|
XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Disassemble the header area
|
|
|
|
*/
|
|
|
|
hdr = (TwoPhaseFileHeader *) buf;
|
|
|
|
Assert(TransactionIdEquals(hdr->xid, xid));
|
|
|
|
bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
|
2016-03-10 13:51:46 +01:00
|
|
|
bufptr += MAXALIGN(hdr->gidlen);
|
2005-06-18 00:32:51 +02:00
|
|
|
children = (TransactionId *) bufptr;
|
|
|
|
bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
|
2008-11-19 11:34:52 +01:00
|
|
|
commitrels = (RelFileNode *) bufptr;
|
|
|
|
bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode));
|
|
|
|
abortrels = (RelFileNode *) bufptr;
|
|
|
|
bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode));
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
invalmsgs = (SharedInvalidationMessage *) bufptr;
|
|
|
|
bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage));
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2007-09-08 22:31:15 +02:00
|
|
|
/* compute latestXid among all children */
|
|
|
|
latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children);
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/*
|
|
|
|
* The order of operations here is critical: make the XLOG entry for
|
|
|
|
* commit or abort, then mark the transaction committed or aborted in
|
2017-03-17 14:46:58 +01:00
|
|
|
* pg_xact, then remove its PGPROC from the global ProcArray (which means
|
2005-10-15 04:49:52 +02:00
|
|
|
* TransactionIdIsInProgress will stop saying the prepared xact is in
|
|
|
|
* progress), then run the post-commit or post-abort callbacks. The
|
|
|
|
* callbacks will release the locks the transaction held.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
if (isCommit)
|
|
|
|
RecordTransactionCommitPrepared(xid,
|
|
|
|
hdr->nsubxacts, children,
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
hdr->ncommitrels, commitrels,
|
|
|
|
hdr->ninvalmsgs, invalmsgs,
|
|
|
|
hdr->initfileinval);
|
2005-06-18 00:32:51 +02:00
|
|
|
else
|
|
|
|
RecordTransactionAbortPrepared(xid,
|
|
|
|
hdr->nsubxacts, children,
|
|
|
|
hdr->nabortrels, abortrels);
|
|
|
|
|
2011-11-25 14:02:10 +01:00
|
|
|
ProcArrayRemove(proc, latestXid);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* In case we fail while running the callbacks, mark the gxact invalid so
|
2015-05-24 03:35:49 +02:00
|
|
|
* no one else will try to commit/rollback, and so it will be recycled if
|
|
|
|
* we fail after this point. It is still locked by our backend so it
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
* won't go away yet.
|
2005-06-19 22:00:39 +02:00
|
|
|
*
|
|
|
|
* (We assume it's safe to do this without taking TwoPhaseStateLock.)
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
gxact->valid = false;
|
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* We have to remove any files that were supposed to be dropped. For
|
|
|
|
* consistency with the regular xact.c code paths, must do this before
|
|
|
|
* releasing locks, so do it before running the callbacks.
|
2005-06-18 07:21:09 +02:00
|
|
|
*
|
2005-06-18 00:32:51 +02:00
|
|
|
* NB: this code knows that we couldn't be dropping any temp rels ...
|
|
|
|
*/
|
|
|
|
if (isCommit)
|
|
|
|
{
|
2008-11-19 11:34:52 +01:00
|
|
|
delrels = commitrels;
|
|
|
|
ndelrels = hdr->ncommitrels;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2008-11-19 11:34:52 +01:00
|
|
|
delrels = abortrels;
|
|
|
|
ndelrels = hdr->nabortrels;
|
|
|
|
}
|
|
|
|
for (i = 0; i < ndelrels; i++)
|
|
|
|
{
|
2010-08-13 22:10:54 +02:00
|
|
|
SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
|
2008-11-19 11:34:52 +01:00
|
|
|
|
2012-06-07 23:42:27 +02:00
|
|
|
smgrdounlink(srel, false);
|
2008-11-19 11:34:52 +01:00
|
|
|
smgrclose(srel);
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
/*
|
|
|
|
* Handle cache invalidation messages.
|
|
|
|
*
|
2010-02-26 03:01:40 +01:00
|
|
|
* Relcache init file invalidation requires processing both before and
|
|
|
|
* after we send the SI messages. See AtEOXact_Inval()
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
*/
|
|
|
|
if (hdr->initfileinval)
|
2011-08-16 19:11:54 +02:00
|
|
|
RelationCacheInitFilePreInvalidate();
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
SendSharedInvalidMessages(invalmsgs, hdr->ninvalmsgs);
|
|
|
|
if (hdr->initfileinval)
|
2011-08-16 19:11:54 +02:00
|
|
|
RelationCacheInitFilePostInvalidate();
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
|
2005-06-18 07:21:09 +02:00
|
|
|
/* And now do the callbacks */
|
|
|
|
if (isCommit)
|
|
|
|
ProcessRecords(bufptr, xid, twophase_postcommit_callbacks);
|
|
|
|
else
|
|
|
|
ProcessRecords(bufptr, xid, twophase_postabort_callbacks);
|
|
|
|
|
Implement genuine serializable isolation level.
Until now, our Serializable mode has in fact been what's called Snapshot
Isolation, which allows some anomalies that could not occur in any
serialized ordering of the transactions. This patch fixes that using a
method called Serializable Snapshot Isolation, based on research papers by
Michael J. Cahill (see README-SSI for full references). In Serializable
Snapshot Isolation, transactions run like they do in Snapshot Isolation,
but a predicate lock manager observes the reads and writes performed and
aborts transactions if it detects that an anomaly might occur. This method
produces some false positives, ie. it sometimes aborts transactions even
though there is no anomaly.
To track reads we implement predicate locking, see storage/lmgr/predicate.c.
Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared
memory is finite, so when a transaction takes many tuple-level locks on a
page, the locks are promoted to a single page-level lock, and further to a
single relation level lock if necessary. To lock key values with no matching
tuple, a sequential scan always takes a relation-level lock, and an index
scan acquires a page-level lock that covers the search key, whether or not
there are any matching keys at the moment.
A predicate lock doesn't conflict with any regular locks or with another
predicate locks in the normal sense. They're only used by the predicate lock
manager to detect the danger of anomalies. Only serializable transactions
participate in predicate locking, so there should be no extra overhead for
for other transactions.
Predicate locks can't be released at commit, but must be remembered until
all the transactions that overlapped with it have completed. That means that
we need to remember an unbounded amount of predicate locks, so we apply a
lossy but conservative method of tracking locks for committed transactions.
If we run short of shared memory, we overflow to a new "pg_serial" SLRU
pool.
We don't currently allow Serializable transactions in Hot Standby mode.
That would be hard, because even read-only transactions can cause anomalies
that wouldn't otherwise occur.
Serializable isolation mode now means the new fully serializable level.
Repeatable Read gives you the old Snapshot Isolation level that we have
always had.
Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and
Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
|
|
|
PredicateLockTwoPhaseFinish(xid, isCommit);
|
|
|
|
|
2007-05-27 05:50:39 +02:00
|
|
|
/* Count the prepared xact as committed or aborted */
|
|
|
|
AtEOXact_PgStat(isCommit);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/*
|
2016-01-21 03:40:44 +01:00
|
|
|
* And now we can clean up any files we may have left.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
2016-01-21 03:40:44 +01:00
|
|
|
if (gxact->ondisk)
|
|
|
|
RemoveTwoPhaseFile(xid, true);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
RemoveGXact(gxact);
|
Fix race condition in preparing a transaction for two-phase commit.
To lock a prepared transaction's shared memory entry, we used to mark it
with the XID of the backend. When the XID was no longer active according
to the proc array, the entry was implicitly considered as not locked
anymore. However, when preparing a transaction, the backend's proc array
entry was cleared before transfering the locks (and some other state) to
the prepared transaction's dummy PGPROC entry, so there was a window where
another backend could finish the transaction before it was in fact fully
prepared.
To fix, rewrite the locking mechanism of global transaction entries. Instead
of an XID, just have simple locked-or-not flag in each entry (we store the
locking backend's backend id rather than a simple boolean, but that's just
for debugging purposes). The backend is responsible for explicitly unlocking
the entry, and to make sure that that happens, install a callback to unlock
it on abort or process exit.
Backpatch to all supported versions.
2014-05-15 15:37:50 +02:00
|
|
|
MyLockedGxact = NULL;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
pfree(buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2016-01-21 03:40:44 +01:00
|
|
|
* Scan 2PC state data in memory and call the indicated callbacks for each 2PC record.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
ProcessRecords(char *bufptr, TransactionId xid,
|
|
|
|
const TwoPhaseCallback callbacks[])
|
|
|
|
{
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
TwoPhaseRecordOnDisk *record = (TwoPhaseRecordOnDisk *) bufptr;
|
|
|
|
|
|
|
|
Assert(record->rmid <= TWOPHASE_RM_MAX_ID);
|
|
|
|
if (record->rmid == TWOPHASE_RM_END_ID)
|
|
|
|
break;
|
|
|
|
|
|
|
|
bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk));
|
|
|
|
|
|
|
|
if (callbacks[record->rmid] != NULL)
|
2005-10-15 04:49:52 +02:00
|
|
|
callbacks[record->rmid] (xid, record->info,
|
|
|
|
(void *) bufptr, record->len);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
bufptr += MAXALIGN(record->len);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove the 2PC file for the specified XID.
|
|
|
|
*
|
|
|
|
* If giveWarning is false, do not complain about file-not-present;
|
|
|
|
* this is an expected case during WAL replay.
|
|
|
|
*/
|
2017-04-04 21:56:56 +02:00
|
|
|
static void
|
2005-06-18 00:32:51 +02:00
|
|
|
RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
|
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
char path[MAXPGPATH];
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
TwoPhaseFilePath(path, xid);
|
|
|
|
if (unlink(path))
|
|
|
|
if (errno != ENOENT || giveWarning)
|
|
|
|
ereport(WARNING,
|
|
|
|
(errcode_for_file_access(),
|
2007-11-15 22:14:46 +01:00
|
|
|
errmsg("could not remove two-phase state file \"%s\": %m",
|
|
|
|
path)));
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2016-01-21 03:40:44 +01:00
|
|
|
* Recreates a state file. This is used in WAL replay and during
|
|
|
|
* checkpoint creation.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
|
|
|
* Note: content and len don't include CRC.
|
|
|
|
*/
|
2017-04-04 21:56:56 +02:00
|
|
|
static void
|
2005-06-18 00:32:51 +02:00
|
|
|
RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
|
|
|
|
{
|
|
|
|
char path[MAXPGPATH];
|
2015-04-14 16:03:42 +02:00
|
|
|
pg_crc32c statefile_crc;
|
2005-06-18 00:32:51 +02:00
|
|
|
int fd;
|
|
|
|
|
|
|
|
/* Recompute CRC */
|
Switch to CRC-32C in WAL and other places.
The old algorithm was found to not be the usual CRC-32 algorithm, used by
Ethernet et al. We were using a non-reflected lookup table with code meant
for a reflected lookup table. That's a strange combination that AFAICS does
not correspond to any bit-wise CRC calculation, which makes it difficult to
reason about its properties. Although it has worked well in practice, seems
safer to use a well-known algorithm.
Since we're changing the algorithm anyway, we might as well choose a
different polynomial. The Castagnoli polynomial has better error-correcting
properties than the traditional CRC-32 polynomial, even if we had
implemented it correctly. Another reason for picking that is that some new
CPUs have hardware support for calculating CRC-32C, but not CRC-32, let
alone our strange variant of it. This patch doesn't add any support for such
hardware, but a future patch could now do that.
The old algorithm is kept around for tsquery and pg_trgm, which use the
values in indexes that need to remain compatible so that pg_upgrade works.
While we're at it, share the old lookup table for CRC-32 calculation
between hstore, ltree and core. They all use the same table, so might as
well.
2014-11-04 10:35:15 +01:00
|
|
|
INIT_CRC32C(statefile_crc);
|
|
|
|
COMP_CRC32C(statefile_crc, content, len);
|
|
|
|
FIN_CRC32C(statefile_crc);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
TwoPhaseFilePath(path, xid);
|
|
|
|
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
fd = OpenTransientFile(path,
|
|
|
|
O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY,
|
|
|
|
S_IRUSR | S_IWUSR);
|
2005-06-18 00:32:51 +02:00
|
|
|
if (fd < 0)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
2006-10-06 19:14:01 +02:00
|
|
|
errmsg("could not recreate two-phase state file \"%s\": %m",
|
2005-06-18 00:32:51 +02:00
|
|
|
path)));
|
|
|
|
|
|
|
|
/* Write content and CRC */
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_WRITE);
|
2005-06-18 00:32:51 +02:00
|
|
|
if (write(fd, content, len) != len)
|
|
|
|
{
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
pgstat_report_wait_end();
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
CloseTransientFile(fd);
|
2005-06-18 00:32:51 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
2006-10-06 19:14:01 +02:00
|
|
|
errmsg("could not write two-phase state file: %m")));
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
2015-04-14 16:03:42 +02:00
|
|
|
if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c))
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
pgstat_report_wait_end();
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
CloseTransientFile(fd);
|
2005-06-18 00:32:51 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
2006-10-06 19:14:01 +02:00
|
|
|
errmsg("could not write two-phase state file: %m")));
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
pgstat_report_wait_end();
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2005-06-19 22:00:39 +02:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* We must fsync the file because the end-of-replay checkpoint will not do
|
|
|
|
* so, there being no GXACT in shared memory yet to tell it to.
|
2005-06-19 22:00:39 +02:00
|
|
|
*/
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_SYNC);
|
2005-06-18 00:32:51 +02:00
|
|
|
if (pg_fsync(fd) != 0)
|
|
|
|
{
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add a function called OpenTransientFile, which returns an
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
CloseTransientFile(fd);
|
2005-06-18 00:32:51 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
2006-10-06 19:14:01 +02:00
|
|
|
errmsg("could not fsync two-phase state file: %m")));
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
Create and use wait events for read, write, and fsync operations.
Previous commits, notably 53be0b1add7064ca5db3cd884302dfc3268d884e and
6f3bd98ebfc008cbd676da777bb0b2376c4c4bfa, made it possible to see from
pg_stat_activity when a backend was stuck waiting for another backend,
but it's also fairly common for a backend to be stuck waiting for an
I/O. Add wait events for those operations, too.
Rushabh Lathia, with further hacking by me. Reviewed and tested by
Michael Paquier, Amit Kapila, Rajkumar Raghuwanshi, and Rahila Syed.
Discussion: http://postgr.es/m/CAGPqQf0LsYHXREPAZqYGVkDqHSyjf=KsD=k0GTVPAuzyThh-VQ@mail.gmail.com
2017-03-18 12:43:01 +01:00
|
|
|
pgstat_report_wait_end();
|
2005-06-18 00:32:51 +02:00
|
|
|
|
Add OpenTransientFile, with automatic cleanup at end-of-xact.
Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add a function called OpenTransientFile, which returns an
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().
This plugs a few rare fd leaks in error cases:
1. copy_file() - fixed by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.
In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.
The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.
2012-11-27 09:25:50 +01:00
|
|
|
if (CloseTransientFile(fd) != 0)
|
2005-06-18 00:32:51 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode_for_file_access(),
|
2006-10-06 19:14:01 +02:00
|
|
|
errmsg("could not close two-phase state file: %m")));
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
2005-06-19 22:00:39 +02:00
|
|
|
/*
|
|
|
|
* CheckPointTwoPhase -- handle 2PC component of checkpointing.
|
|
|
|
*
|
2017-04-04 21:56:56 +02:00
|
|
|
* We must fsync the state file of any GXACT that is valid or has been
|
|
|
|
* generated during redo and has a PREPARE LSN <= the checkpoint's redo
|
|
|
|
* horizon. (If the gxact isn't valid yet, has not been generated in
|
|
|
|
* redo, or has a later LSN, this checkpoint is not responsible for
|
|
|
|
* fsyncing it.)
|
2005-06-19 22:00:39 +02:00
|
|
|
*
|
|
|
|
* This is deliberately run as late as possible in the checkpoint sequence,
|
|
|
|
* because GXACTs ordinarily have short lifespans, and so it is quite
|
|
|
|
* possible that GXACTs that were valid at checkpoint start will no longer
|
2016-01-21 03:40:44 +01:00
|
|
|
* exist if we wait a little bit. With typical checkpoint settings this
|
|
|
|
* will be about 3 minutes for an online checkpoint, so as a result we
|
|
|
|
* we expect that there will be no GXACTs that need to be copied to disk.
|
2005-06-19 22:00:39 +02:00
|
|
|
*
|
2016-01-21 03:40:44 +01:00
|
|
|
* If a GXACT remains valid across multiple checkpoints, it will already
|
|
|
|
* be on disk so we don't bother to repeat that write.
|
2005-06-19 22:00:39 +02:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
CheckPointTwoPhase(XLogRecPtr redo_horizon)
|
|
|
|
{
|
|
|
|
int i;
|
2016-01-21 03:40:44 +01:00
|
|
|
int serialized_xacts = 0;
|
2005-06-19 22:00:39 +02:00
|
|
|
|
|
|
|
if (max_prepared_xacts <= 0)
|
|
|
|
return; /* nothing to do */
|
2008-08-01 15:16:09 +02:00
|
|
|
|
|
|
|
TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();
|
|
|
|
|
2016-01-21 03:40:44 +01:00
|
|
|
/*
|
2016-06-10 00:02:36 +02:00
|
|
|
* We are expecting there to be zero GXACTs that need to be copied to
|
|
|
|
* disk, so we perform all I/O while holding TwoPhaseStateLock for
|
|
|
|
* simplicity. This prevents any new xacts from preparing while this
|
|
|
|
* occurs, which shouldn't be a problem since the presence of long-lived
|
|
|
|
* prepared xacts indicates the transaction manager isn't active.
|
2016-01-21 03:40:44 +01:00
|
|
|
*
|
2016-06-10 00:02:36 +02:00
|
|
|
* It's also possible to move I/O out of the lock, but on every error we
|
|
|
|
* should check whether somebody committed our transaction in different
|
2017-03-14 16:38:30 +01:00
|
|
|
* backend. Let's leave this optimization for future, if somebody will
|
2016-06-10 00:02:36 +02:00
|
|
|
* spot that this place cause bottleneck.
|
2016-01-21 03:40:44 +01:00
|
|
|
*
|
2016-06-10 00:02:36 +02:00
|
|
|
* Note that it isn't possible for there to be a GXACT with a
|
|
|
|
* prepare_end_lsn set prior to the last checkpoint yet is marked invalid,
|
|
|
|
* because of the efforts with delayChkpt.
|
2016-01-21 03:40:44 +01:00
|
|
|
*/
|
2005-06-19 22:00:39 +02:00
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
|
|
|
|
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
|
|
|
|
{
|
2017-04-04 21:56:56 +02:00
|
|
|
/* Note that we are using gxact not pgxact so this works in recovery also */
|
2005-10-15 04:49:52 +02:00
|
|
|
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
|
2005-06-19 22:00:39 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
if ((gxact->valid || gxact->inredo) &&
|
2016-01-21 03:40:44 +01:00
|
|
|
!gxact->ondisk &&
|
|
|
|
gxact->prepare_end_lsn <= redo_horizon)
|
2005-06-19 22:00:39 +02:00
|
|
|
{
|
2016-01-21 03:40:44 +01:00
|
|
|
char *buf;
|
2016-06-10 00:02:36 +02:00
|
|
|
int len;
|
2005-06-19 22:00:39 +02:00
|
|
|
|
2016-01-21 03:40:44 +01:00
|
|
|
XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len);
|
2017-04-04 21:56:56 +02:00
|
|
|
RecreateTwoPhaseFile(gxact->xid, buf, len);
|
2016-01-21 03:40:44 +01:00
|
|
|
gxact->ondisk = true;
|
2017-04-04 21:56:56 +02:00
|
|
|
gxact->prepare_start_lsn = InvalidXLogRecPtr;
|
|
|
|
gxact->prepare_end_lsn = InvalidXLogRecPtr;
|
2016-01-21 03:40:44 +01:00
|
|
|
pfree(buf);
|
|
|
|
serialized_xacts++;
|
2005-06-19 22:00:39 +02:00
|
|
|
}
|
|
|
|
}
|
2016-01-21 03:40:44 +01:00
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
2008-08-01 15:16:09 +02:00
|
|
|
|
2017-03-27 18:33:01 +02:00
|
|
|
/*
|
|
|
|
* Flush unconditionally the parent directory to make any information
|
|
|
|
* durable on disk. Two-phase files could have been removed and those
|
|
|
|
* removals need to be made persistent as well as any files newly created
|
|
|
|
* previously since the last checkpoint.
|
|
|
|
*/
|
|
|
|
fsync_fname(TWOPHASE_DIR, true);
|
|
|
|
|
2008-08-01 15:16:09 +02:00
|
|
|
TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
|
2016-01-21 03:40:44 +01:00
|
|
|
|
|
|
|
if (log_checkpoints && serialized_xacts > 0)
|
|
|
|
ereport(LOG,
|
2016-07-28 22:18:35 +02:00
|
|
|
(errmsg_plural("%u two-phase state file was written "
|
2017-02-03 23:13:33 +01:00
|
|
|
"for a long-running prepared transaction",
|
2016-07-28 22:18:35 +02:00
|
|
|
"%u two-phase state files were written "
|
|
|
|
"for long-running prepared transactions",
|
|
|
|
serialized_xacts,
|
|
|
|
serialized_xacts)));
|
2005-06-19 22:00:39 +02:00
|
|
|
}
|
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
/*
|
|
|
|
* restoreTwoPhaseData
|
|
|
|
*
|
|
|
|
* Scan pg_twophase and fill TwoPhaseState depending on the on-disk data.
|
|
|
|
* This is called once at the beginning of recovery, saving any extra
|
|
|
|
* lookups in the future. Two-phase files that are newer than the
|
|
|
|
* minimum XID horizon are discarded on the way.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
restoreTwoPhaseData(void)
|
|
|
|
{
|
|
|
|
DIR *cldir;
|
|
|
|
struct dirent *clde;
|
|
|
|
|
|
|
|
cldir = AllocateDir(TWOPHASE_DIR);
|
|
|
|
while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
|
|
|
|
{
|
|
|
|
if (strlen(clde->d_name) == 8 &&
|
|
|
|
strspn(clde->d_name, "0123456789ABCDEF") == 8)
|
|
|
|
{
|
|
|
|
TransactionId xid;
|
|
|
|
char *buf;
|
|
|
|
|
|
|
|
xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
|
|
|
|
|
|
|
|
buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr,
|
|
|
|
true, false, false,
|
2017-04-18 12:14:05 +02:00
|
|
|
false);
|
2017-04-04 21:56:56 +02:00
|
|
|
if (buf == NULL)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
PrepareRedoAdd(buf, InvalidXLogRecPtr, InvalidXLogRecPtr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
FreeDir(cldir);
|
|
|
|
}
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/*
|
|
|
|
* PrescanPreparedTransactions
|
|
|
|
*
|
2017-04-04 21:56:56 +02:00
|
|
|
* Scan the shared memory entries of TwoPhaseState and determine the range
|
|
|
|
* of valid XIDs present. This is run during database startup, after we
|
|
|
|
* have completed reading WAL. ShmemVariableCache->nextXid has been set to
|
|
|
|
* one more than the highest XID for which evidence exists in WAL.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
|
|
|
* We throw away any prepared xacts with main XID beyond nextXid --- if any
|
|
|
|
* are present, it suggests that the DBA has done a PITR recovery to an
|
2014-05-06 18:12:18 +02:00
|
|
|
* earlier point in time without cleaning out pg_twophase. We dare not
|
2005-06-18 00:32:51 +02:00
|
|
|
* try to recover such prepared xacts since they likely depend on database
|
|
|
|
* state that doesn't exist now.
|
|
|
|
*
|
|
|
|
* However, we will advance nextXid beyond any subxact XIDs belonging to
|
|
|
|
* valid prepared xacts. We need to do this since subxact commit doesn't
|
|
|
|
* write a WAL entry, and so there might be no evidence in WAL of those
|
|
|
|
* subxact XIDs.
|
|
|
|
*
|
|
|
|
* Our other responsibility is to determine and return the oldest valid XID
|
|
|
|
* among the prepared xacts (if none, return ShmemVariableCache->nextXid).
|
|
|
|
* This is needed to synchronize pg_subtrans startup properly.
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
*
|
|
|
|
* If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all
|
|
|
|
* top-level xids is stored in *xids_p. The number of entries in the array
|
|
|
|
* is returned in *nxids_p.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
TransactionId
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
|
|
|
TransactionId origNextXid = ShmemVariableCache->nextXid;
|
|
|
|
TransactionId result = origNextXid;
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
TransactionId *xids = NULL;
|
|
|
|
int nxids = 0;
|
|
|
|
int allocsize = 0;
|
2017-04-04 21:56:56 +02:00
|
|
|
int i;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
|
|
|
|
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
2017-04-04 21:56:56 +02:00
|
|
|
TransactionId xid;
|
|
|
|
char *buf;
|
|
|
|
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
Assert(gxact->inredo);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
xid = gxact->xid;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
buf = ProcessTwoPhaseBuffer(xid,
|
|
|
|
gxact->prepare_start_lsn,
|
|
|
|
gxact->ondisk, false, false,
|
2017-04-18 12:14:05 +02:00
|
|
|
true);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
if (buf == NULL)
|
|
|
|
continue;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-18 12:14:05 +02:00
|
|
|
/*
|
|
|
|
* OK, we think this file is valid. Incorporate xid into the
|
|
|
|
* running-minimum result.
|
|
|
|
*/
|
|
|
|
if (TransactionIdPrecedes(xid, result))
|
|
|
|
result = xid;
|
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
if (xids_p)
|
|
|
|
{
|
|
|
|
if (nxids == allocsize)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
2017-04-04 21:56:56 +02:00
|
|
|
if (nxids == 0)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
2017-04-04 21:56:56 +02:00
|
|
|
allocsize = 10;
|
|
|
|
xids = palloc(allocsize * sizeof(TransactionId));
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
2017-04-04 21:56:56 +02:00
|
|
|
else
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
{
|
2017-04-04 21:56:56 +02:00
|
|
|
allocsize = allocsize * 2;
|
|
|
|
xids = repalloc(xids, allocsize * sizeof(TransactionId));
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
}
|
|
|
|
}
|
2017-04-04 21:56:56 +02:00
|
|
|
xids[nxids++] = xid;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
2017-04-04 21:56:56 +02:00
|
|
|
|
|
|
|
pfree(buf);
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
2017-04-04 21:56:56 +02:00
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
if (xids_p)
|
|
|
|
{
|
|
|
|
*xids_p = xids;
|
|
|
|
*nxids_p = nxids;
|
|
|
|
}
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2010-04-13 16:17:46 +02:00
|
|
|
/*
|
|
|
|
* StandbyRecoverPreparedTransactions
|
|
|
|
*
|
2017-04-04 21:56:56 +02:00
|
|
|
* Scan the shared memory entries of TwoPhaseState and setup all the required
|
|
|
|
* information to allow standby queries to treat prepared transactions as still
|
|
|
|
* active.
|
|
|
|
*
|
2010-04-13 16:17:46 +02:00
|
|
|
* This is never called at the end of recovery - we use
|
|
|
|
* RecoverPreparedTransactions() at that point.
|
|
|
|
*
|
|
|
|
* Currently we simply call SubTransSetParent() for any subxids of prepared
|
|
|
|
* transactions. If overwriteOK is true, it's OK if some XIDs have already
|
|
|
|
* been marked in pg_subtrans.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
StandbyRecoverPreparedTransactions(bool overwriteOK)
|
|
|
|
{
|
2017-04-04 21:56:56 +02:00
|
|
|
int i;
|
2010-04-13 16:17:46 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
|
|
|
|
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
|
2010-04-13 16:17:46 +02:00
|
|
|
{
|
2017-04-04 21:56:56 +02:00
|
|
|
TransactionId xid;
|
|
|
|
char *buf;
|
|
|
|
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
|
2010-04-13 16:17:46 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
Assert(gxact->inredo);
|
2010-04-13 16:17:46 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
xid = gxact->xid;
|
2016-09-08 11:32:58 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
buf = ProcessTwoPhaseBuffer(xid,
|
|
|
|
gxact->prepare_start_lsn,
|
|
|
|
gxact->ondisk, overwriteOK, true,
|
2017-04-18 12:14:05 +02:00
|
|
|
false);
|
2017-04-04 21:56:56 +02:00
|
|
|
if (buf != NULL)
|
2016-09-08 11:32:58 +02:00
|
|
|
pfree(buf);
|
2010-04-13 16:17:46 +02:00
|
|
|
}
|
2017-04-04 21:56:56 +02:00
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
2010-04-13 16:17:46 +02:00
|
|
|
}
|
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/*
|
|
|
|
* RecoverPreparedTransactions
|
|
|
|
*
|
2017-04-04 21:56:56 +02:00
|
|
|
* Scan the shared memory entries of TwoPhaseState and reload the state for
|
|
|
|
* each prepared transaction (reacquire locks, etc).
|
|
|
|
*
|
|
|
|
* This is run during database startup.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
RecoverPreparedTransactions(void)
|
|
|
|
{
|
2017-04-04 21:56:56 +02:00
|
|
|
int i;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
/*
|
|
|
|
* Don't need a lock in the recovery phase.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
2017-04-04 21:56:56 +02:00
|
|
|
TransactionId xid;
|
|
|
|
char *buf;
|
|
|
|
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
|
|
|
|
char *bufptr;
|
|
|
|
TwoPhaseFileHeader *hdr;
|
|
|
|
TransactionId *subxids;
|
|
|
|
const char *gid;
|
|
|
|
bool overwriteOK = false;
|
|
|
|
int i;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
xid = gxact->xid;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
buf = ProcessTwoPhaseBuffer(xid,
|
|
|
|
gxact->prepare_start_lsn,
|
|
|
|
gxact->ondisk, false, false,
|
2017-04-18 12:14:05 +02:00
|
|
|
false);
|
2017-04-04 21:56:56 +02:00
|
|
|
if (buf == NULL)
|
|
|
|
continue;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
ereport(LOG,
|
|
|
|
(errmsg("recovering prepared transaction %u from shared memory", xid)));
|
|
|
|
|
|
|
|
hdr = (TwoPhaseFileHeader *) buf;
|
|
|
|
Assert(TransactionIdEquals(hdr->xid, xid));
|
|
|
|
bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
|
|
|
|
gid = (const char *) bufptr;
|
|
|
|
bufptr += MAXALIGN(hdr->gidlen);
|
|
|
|
subxids = (TransactionId *) bufptr;
|
|
|
|
bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
|
|
|
|
bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode));
|
|
|
|
bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode));
|
|
|
|
bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage));
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
/*
|
|
|
|
* It's possible that SubTransSetParent has been set before, if
|
|
|
|
* the prepared transaction generated xid assignment records. Test
|
|
|
|
* here must match one used in AssignTransactionId().
|
|
|
|
*/
|
|
|
|
if (InHotStandby && (hdr->nsubxacts >= PGPROC_MAX_CACHED_SUBXIDS ||
|
|
|
|
XLogLogicalInfoActive()))
|
|
|
|
overwriteOK = true;
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
/*
|
|
|
|
* Reconstruct subtrans state for the transaction --- needed
|
|
|
|
* because pg_subtrans is not preserved over a restart. Note that
|
|
|
|
* we are linking all the subtransactions directly to the
|
|
|
|
* top-level XID; there may originally have been a more complex
|
|
|
|
* hierarchy, but there's no need to restore that exactly.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < hdr->nsubxacts; i++)
|
|
|
|
SubTransSetParent(subxids[i], xid, overwriteOK);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
/*
|
|
|
|
* Recreate its GXACT and dummy PGPROC. But, check whether
|
|
|
|
* it was added in redo and already has a shmem entry for
|
|
|
|
* it.
|
|
|
|
*/
|
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
|
|
|
|
MarkAsPreparingGuts(gxact, xid, gid,
|
|
|
|
hdr->prepared_at,
|
|
|
|
hdr->owner, hdr->database);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
/* recovered, so reset the flag for entries generated by redo */
|
|
|
|
gxact->inredo = false;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids);
|
|
|
|
MarkAsPrepared(gxact);
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
/*
|
|
|
|
* Recover other state (notably locks) using resource managers
|
|
|
|
*/
|
|
|
|
ProcessRecords(bufptr, xid, twophase_recover_callbacks);
|
Fix deadlock at startup, if max_prepared_transactions is too small.
When the startup process recovers transactions by scanning pg_twophase
directory, it should clear MyLockedGxact after it's done processing each
transaction. Like we do during normal operation, at PREPARE TRANSACTION.
Otherwise, if the startup process exits due to an error, it will try to
clear the locking_backend field of the last recovered transaction. That's
usually harmless, but if the error happens in MarkAsPreparing, while
holding TwoPhaseStateLock, the shmem-exit hook will try to acquire
TwoPhaseStateLock again, and deadlock with itself.
This fixes bug #13128 reported by Grant McAlister. The bug was introduced
by commit bb38fb0d, so backpatch to all supported versions like that
commit.
2015-04-23 20:25:44 +02:00
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
/*
|
|
|
|
* Release locks held by the standby process after we process each
|
|
|
|
* prepared transaction. As a result, we don't need too many
|
|
|
|
* additional locks at any one time.
|
|
|
|
*/
|
|
|
|
if (InHotStandby)
|
|
|
|
StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We're done with recovering this transaction. Clear
|
|
|
|
* MyLockedGxact, like we do in PrepareTransaction() during normal
|
|
|
|
* operation.
|
|
|
|
*/
|
|
|
|
PostPrepare_Twophase();
|
|
|
|
|
|
|
|
pfree(buf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ProcessTwoPhaseBuffer
|
|
|
|
*
|
|
|
|
* Given a transaction id, read it either from disk or read it directly
|
|
|
|
* via shmem xlog record pointer using the provided "prepare_start_lsn".
|
|
|
|
*
|
|
|
|
* If setParent is true, then use the overwriteOK parameter to set up
|
|
|
|
* subtransaction parent linkages.
|
|
|
|
*
|
2017-04-18 12:14:05 +02:00
|
|
|
* If setNextXid is true, set ShmemVariableCache->nextXid to the newest
|
|
|
|
* value scanned.
|
2017-04-04 21:56:56 +02:00
|
|
|
*/
|
|
|
|
static char *
|
|
|
|
ProcessTwoPhaseBuffer(TransactionId xid,
|
|
|
|
XLogRecPtr prepare_start_lsn,
|
|
|
|
bool fromdisk, bool overwriteOK,
|
2017-04-18 12:14:05 +02:00
|
|
|
bool setParent, bool setNextXid)
|
2017-04-04 21:56:56 +02:00
|
|
|
{
|
|
|
|
TransactionId origNextXid = ShmemVariableCache->nextXid;
|
|
|
|
TransactionId *subxids;
|
|
|
|
char *buf;
|
|
|
|
TwoPhaseFileHeader *hdr;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (!fromdisk)
|
|
|
|
Assert(prepare_start_lsn != InvalidXLogRecPtr);
|
|
|
|
|
|
|
|
/* Already processed? */
|
|
|
|
if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
|
|
|
|
{
|
|
|
|
if (fromdisk)
|
|
|
|
{
|
|
|
|
ereport(WARNING,
|
|
|
|
(errmsg("removing stale two-phase state file for \"%u\"",
|
|
|
|
xid)));
|
|
|
|
RemoveTwoPhaseFile(xid, true);
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
2017-04-04 21:56:56 +02:00
|
|
|
else
|
|
|
|
{
|
|
|
|
ereport(WARNING,
|
|
|
|
(errmsg("removing stale two-phase state from"
|
|
|
|
" shared memory for \"%u\"", xid)));
|
|
|
|
PrepareRedoRemove(xid, true);
|
|
|
|
}
|
|
|
|
return NULL;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
2017-04-04 21:56:56 +02:00
|
|
|
|
|
|
|
/* Reject XID if too new */
|
|
|
|
if (TransactionIdFollowsOrEquals(xid, origNextXid))
|
|
|
|
{
|
|
|
|
if (fromdisk)
|
|
|
|
{
|
|
|
|
ereport(WARNING,
|
|
|
|
(errmsg("removing future two-phase state file for \"%u\"",
|
|
|
|
xid)));
|
|
|
|
RemoveTwoPhaseFile(xid, true);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ereport(WARNING,
|
|
|
|
(errmsg("removing future two-phase state from memory for \"%u\"",
|
|
|
|
xid)));
|
|
|
|
PrepareRedoRemove(xid, true);
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fromdisk)
|
|
|
|
{
|
|
|
|
/* Read and validate file */
|
|
|
|
buf = ReadTwoPhaseFile(xid, true);
|
|
|
|
if (buf == NULL)
|
|
|
|
{
|
|
|
|
ereport(WARNING,
|
|
|
|
(errmsg("removing corrupt two-phase state file for \"%u\"",
|
|
|
|
xid)));
|
|
|
|
RemoveTwoPhaseFile(xid, true);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* Read xlog data */
|
|
|
|
XlogReadTwoPhaseData(prepare_start_lsn, &buf, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Deconstruct header */
|
|
|
|
hdr = (TwoPhaseFileHeader *) buf;
|
|
|
|
if (!TransactionIdEquals(hdr->xid, xid))
|
|
|
|
{
|
|
|
|
if (fromdisk)
|
|
|
|
{
|
|
|
|
ereport(WARNING,
|
|
|
|
(errmsg("removing corrupt two-phase state file for \"%u\"",
|
|
|
|
xid)));
|
|
|
|
RemoveTwoPhaseFile(xid, true);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ereport(WARNING,
|
|
|
|
(errmsg("removing corrupt two-phase state from memory for \"%u\"",
|
|
|
|
xid)));
|
|
|
|
PrepareRedoRemove(xid, true);
|
|
|
|
}
|
|
|
|
pfree(buf);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Examine subtransaction XIDs ... they should all follow main
|
|
|
|
* XID, and they may force us to advance nextXid.
|
|
|
|
*/
|
|
|
|
subxids = (TransactionId *) (buf +
|
|
|
|
MAXALIGN(sizeof(TwoPhaseFileHeader)) +
|
|
|
|
MAXALIGN(hdr->gidlen));
|
|
|
|
for (i = 0; i < hdr->nsubxacts; i++)
|
|
|
|
{
|
|
|
|
TransactionId subxid = subxids[i];
|
|
|
|
|
|
|
|
Assert(TransactionIdFollows(subxid, xid));
|
2017-04-18 12:14:05 +02:00
|
|
|
|
|
|
|
/* update nextXid if needed */
|
|
|
|
if (setNextXid &&
|
|
|
|
TransactionIdFollowsOrEquals(subxid,
|
|
|
|
ShmemVariableCache->nextXid))
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We don't expect anyone else to modify nextXid, hence we don't
|
|
|
|
* need to hold a lock while examining it. We still acquire the
|
|
|
|
* lock to modify it, though, so we recheck.
|
|
|
|
*/
|
|
|
|
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
|
|
|
|
if (TransactionIdFollowsOrEquals(subxid,
|
|
|
|
ShmemVariableCache->nextXid))
|
|
|
|
{
|
|
|
|
ShmemVariableCache->nextXid = subxid;
|
|
|
|
TransactionIdAdvance(ShmemVariableCache->nextXid);
|
|
|
|
}
|
|
|
|
LWLockRelease(XidGenLock);
|
|
|
|
}
|
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
if (setParent)
|
|
|
|
SubTransSetParent(xid, subxid, overwriteOK);
|
|
|
|
}
|
|
|
|
|
|
|
|
return buf;
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
2017-04-04 21:56:56 +02:00
|
|
|
|
2005-06-18 00:32:51 +02:00
|
|
|
/*
|
|
|
|
* RecordTransactionCommitPrepared
|
|
|
|
*
|
2015-09-29 19:40:56 +02:00
|
|
|
* This is basically the same as RecordTransactionCommit (q.v. if you change
|
|
|
|
* this function): in particular, we must set the delayChkpt flag to avoid a
|
|
|
|
* race condition.
|
2005-06-18 00:32:51 +02:00
|
|
|
*
|
|
|
|
* We know the transaction made at least one XLOG entry (its PREPARE),
|
|
|
|
* so it is never possible to optimize out the commit record.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
RecordTransactionCommitPrepared(TransactionId xid,
|
|
|
|
int nchildren,
|
|
|
|
TransactionId *children,
|
|
|
|
int nrels,
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
|
|
|
RelFileNode *rels,
|
|
|
|
int ninvalmsgs,
|
|
|
|
SharedInvalidationMessage *invalmsgs,
|
|
|
|
bool initfileinval)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
|
|
|
XLogRecPtr recptr;
|
2015-09-29 19:40:56 +02:00
|
|
|
TimestampTz committs = GetCurrentTimestamp();
|
|
|
|
bool replorigin;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Are we using the replication origins feature? Or, in other words, are
|
|
|
|
* we replaying remote actions?
|
|
|
|
*/
|
|
|
|
replorigin = (replorigin_session_origin != InvalidRepOriginId &&
|
|
|
|
replorigin_session_origin != DoNotReplicateId);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
START_CRIT_SECTION();
|
|
|
|
|
|
|
|
/* See notes in RecordTransactionCommit */
|
2012-12-03 14:13:53 +01:00
|
|
|
MyPgXact->delayChkpt = true;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2017-03-22 14:09:36 +01:00
|
|
|
/*
|
|
|
|
* Emit the XLOG commit record. Note that we mark 2PC commits as potentially
|
|
|
|
* having AccessExclusiveLocks since we don't know whether or not they do.
|
|
|
|
*/
|
2015-09-29 19:40:56 +02:00
|
|
|
recptr = XactLogCommitRecord(committs,
|
Merge the various forms of transaction commit & abort records.
Since 465883b0a two versions of commit records have existed. A compact
version that was used when no cache invalidations, smgr unlinks and
similar were needed, and a full version that could deal with all
that. Additionally the full version was embedded into twophase commit
records.
That resulted in a measurable reduction in the size of the logged WAL in
some workloads. But more recently additions like logical decoding, which
e.g. needs information about the database something was executed on,
made it applicable in fewer situations. The static split generally made
it hard to expand the commit record, because concerns over the size made
it hard to add anything to the compact version.
Additionally it's not particularly pretty to have twophase.c insert
RM_XACT records.
Rejigger things so that the commit and abort records only have one form
each, including the twophase equivalents. The presence of the various
optional (in the sense of not being in every record) pieces is indicated
by a bits in the 'xinfo' flag. That flag previously was not included in
compact commit records. To prevent an increase in size due to its
presence, it's only included if necessary; signalled by a bit in the
xl_info bits available for xact.c, similar to heapam.c's
XLOG_HEAP_OPMASK/XLOG_HEAP_INIT_PAGE.
Twophase commit/aborts are now the same as their normal
counterparts. The original transaction's xid is included in an optional
data field.
This means that commit records generally are smaller, except in the case
of a transaction with subtransactions, but no other special cases; the
increase there is four bytes, which seems acceptable given that the more
common case of not having subtransactions shrank. The savings are
especially measurable for twophase commits, which previously always used
the full version; but will in practice only infrequently have required
that.
The motivation for this work are not the space savings and and
deduplication though; it's that it makes it easier to extend commit
records with additional information. That's just a few lines of code
now; without impacting the common case where that information is not
needed.
Discussion: 20150220152150.GD4149@awork2.anarazel.de,
235610.92468.qm%40web29004.mail.ird.yahoo.com
Reviewed-By: Heikki Linnakangas, Simon Riggs
2015-03-15 17:37:07 +01:00
|
|
|
nchildren, children, nrels, rels,
|
|
|
|
ninvalmsgs, invalmsgs,
|
|
|
|
initfileinval, false,
|
2017-03-22 14:09:36 +01:00
|
|
|
MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK,
|
Merge the various forms of transaction commit & abort records.
Since 465883b0a two versions of commit records have existed. A compact
version that was used when no cache invalidations, smgr unlinks and
similar were needed, and a full version that could deal with all
that. Additionally the full version was embedded into twophase commit
records.
That resulted in a measurable reduction in the size of the logged WAL in
some workloads. But more recently additions like logical decoding, which
e.g. needs information about the database something was executed on,
made it applicable in fewer situations. The static split generally made
it hard to expand the commit record, because concerns over the size made
it hard to add anything to the compact version.
Additionally it's not particularly pretty to have twophase.c insert
RM_XACT records.
Rejigger things so that the commit and abort records only have one form
each, including the twophase equivalents. The presence of the various
optional (in the sense of not being in every record) pieces is indicated
by a bits in the 'xinfo' flag. That flag previously was not included in
compact commit records. To prevent an increase in size due to its
presence, it's only included if necessary; signalled by a bit in the
xl_info bits available for xact.c, similar to heapam.c's
XLOG_HEAP_OPMASK/XLOG_HEAP_INIT_PAGE.
Twophase commit/aborts are now the same as their normal
counterparts. The original transaction's xid is included in an optional
data field.
This means that commit records generally are smaller, except in the case
of a transaction with subtransactions, but no other special cases; the
increase there is four bytes, which seems acceptable given that the more
common case of not having subtransactions shrank. The savings are
especially measurable for twophase commits, which previously always used
the full version; but will in practice only infrequently have required
that.
The motivation for this work are not the space savings and and
deduplication though; it's that it makes it easier to extend commit
records with additional information. That's just a few lines of code
now; without impacting the common case where that information is not
needed.
Discussion: 20150220152150.GD4149@awork2.anarazel.de,
235610.92468.qm%40web29004.mail.ird.yahoo.com
Reviewed-By: Heikki Linnakangas, Simon Riggs
2015-03-15 17:37:07 +01:00
|
|
|
xid);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2015-09-29 19:40:56 +02:00
|
|
|
|
|
|
|
if (replorigin)
|
|
|
|
/* Move LSNs forward for this replication origin */
|
|
|
|
replorigin_session_advance(replorigin_session_origin_lsn,
|
|
|
|
XactLastRecEnd);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Record commit timestamp. The value comes from plain commit timestamp
|
|
|
|
* if replorigin is not enabled, or replorigin already set a value for us
|
|
|
|
* in replorigin_session_origin_timestamp otherwise.
|
|
|
|
*
|
|
|
|
* We don't need to WAL-log anything here, as the commit record written
|
|
|
|
* above already contains the data.
|
|
|
|
*/
|
|
|
|
if (!replorigin || replorigin_session_origin_timestamp == 0)
|
|
|
|
replorigin_session_origin_timestamp = committs;
|
|
|
|
|
|
|
|
TransactionTreeSetCommitTsData(xid, nchildren, children,
|
|
|
|
replorigin_session_origin_timestamp,
|
2015-10-01 20:06:55 +02:00
|
|
|
replorigin_session_origin, false);
|
2015-09-29 19:40:56 +02:00
|
|
|
|
2007-08-02 00:45:09 +02:00
|
|
|
/*
|
2007-11-15 22:14:46 +01:00
|
|
|
* We don't currently try to sleep before flush here ... nor is there any
|
|
|
|
* support for async commit of a prepared xact (the very idea is probably
|
|
|
|
* a contradiction)
|
2007-08-02 00:45:09 +02:00
|
|
|
*/
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/* Flush XLOG to disk */
|
|
|
|
XLogFlush(recptr);
|
|
|
|
|
2017-03-17 14:46:58 +01:00
|
|
|
/* Mark the transaction committed in pg_xact */
|
2008-10-20 21:18:18 +02:00
|
|
|
TransactionIdCommitTree(xid, nchildren, children);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
2007-04-03 18:34:36 +02:00
|
|
|
/* Checkpoint can proceed now */
|
2012-12-03 14:13:53 +01:00
|
|
|
MyPgXact->delayChkpt = false;
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
END_CRIT_SECTION();
|
2011-03-06 23:49:16 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait for synchronous replication, if required.
|
|
|
|
*
|
2011-04-10 17:42:00 +02:00
|
|
|
* Note that at this stage we have marked clog, but still show as running
|
|
|
|
* in the procarray and continue to hold locks.
|
2011-03-06 23:49:16 +01:00
|
|
|
*/
|
2016-03-30 03:16:12 +02:00
|
|
|
SyncRepWaitForLSN(recptr, true);
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RecordTransactionAbortPrepared
|
|
|
|
*
|
|
|
|
* This is basically the same as RecordTransactionAbort.
|
|
|
|
*
|
|
|
|
* We know the transaction made at least one XLOG entry (its PREPARE),
|
|
|
|
* so it is never possible to optimize out the abort record.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
RecordTransactionAbortPrepared(TransactionId xid,
|
|
|
|
int nchildren,
|
|
|
|
TransactionId *children,
|
|
|
|
int nrels,
|
2008-11-19 11:34:52 +01:00
|
|
|
RelFileNode *rels)
|
2005-06-18 00:32:51 +02:00
|
|
|
{
|
|
|
|
XLogRecPtr recptr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Catch the scenario where we aborted partway through
|
|
|
|
* RecordTransactionCommitPrepared ...
|
|
|
|
*/
|
|
|
|
if (TransactionIdDidCommit(xid))
|
|
|
|
elog(PANIC, "cannot abort transaction %u, it was already committed",
|
|
|
|
xid);
|
|
|
|
|
|
|
|
START_CRIT_SECTION();
|
|
|
|
|
2017-03-22 14:09:36 +01:00
|
|
|
/*
|
|
|
|
* Emit the XLOG commit record. Note that we mark 2PC aborts as potentially
|
|
|
|
* having AccessExclusiveLocks since we don't know whether or not they do.
|
|
|
|
*/
|
Merge the various forms of transaction commit & abort records.
Since 465883b0a two versions of commit records have existed. A compact
version that was used when no cache invalidations, smgr unlinks and
similar were needed, and a full version that could deal with all
that. Additionally the full version was embedded into twophase commit
records.
That resulted in a measurable reduction in the size of the logged WAL in
some workloads. But more recently additions like logical decoding, which
e.g. needs information about the database something was executed on,
made it applicable in fewer situations. The static split generally made
it hard to expand the commit record, because concerns over the size made
it hard to add anything to the compact version.
Additionally it's not particularly pretty to have twophase.c insert
RM_XACT records.
Rejigger things so that the commit and abort records only have one form
each, including the twophase equivalents. The presence of the various
optional (in the sense of not being in every record) pieces is indicated
by a bits in the 'xinfo' flag. That flag previously was not included in
compact commit records. To prevent an increase in size due to its
presence, it's only included if necessary; signalled by a bit in the
xl_info bits available for xact.c, similar to heapam.c's
XLOG_HEAP_OPMASK/XLOG_HEAP_INIT_PAGE.
Twophase commit/aborts are now the same as their normal
counterparts. The original transaction's xid is included in an optional
data field.
This means that commit records generally are smaller, except in the case
of a transaction with subtransactions, but no other special cases; the
increase there is four bytes, which seems acceptable given that the more
common case of not having subtransactions shrank. The savings are
especially measurable for twophase commits, which previously always used
the full version; but will in practice only infrequently have required
that.
The motivation for this work are not the space savings and and
deduplication though; it's that it makes it easier to extend commit
records with additional information. That's just a few lines of code
now; without impacting the common case where that information is not
needed.
Discussion: 20150220152150.GD4149@awork2.anarazel.de,
235610.92468.qm%40web29004.mail.ird.yahoo.com
Reviewed-By: Heikki Linnakangas, Simon Riggs
2015-03-15 17:37:07 +01:00
|
|
|
recptr = XactLogAbortRecord(GetCurrentTimestamp(),
|
|
|
|
nchildren, children,
|
|
|
|
nrels, rels,
|
2017-03-22 14:09:36 +01:00
|
|
|
MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK,
|
Merge the various forms of transaction commit & abort records.
Since 465883b0a two versions of commit records have existed. A compact
version that was used when no cache invalidations, smgr unlinks and
similar were needed, and a full version that could deal with all
that. Additionally the full version was embedded into twophase commit
records.
That resulted in a measurable reduction in the size of the logged WAL in
some workloads. But more recently additions like logical decoding, which
e.g. needs information about the database something was executed on,
made it applicable in fewer situations. The static split generally made
it hard to expand the commit record, because concerns over the size made
it hard to add anything to the compact version.
Additionally it's not particularly pretty to have twophase.c insert
RM_XACT records.
Rejigger things so that the commit and abort records only have one form
each, including the twophase equivalents. The presence of the various
optional (in the sense of not being in every record) pieces is indicated
by a bits in the 'xinfo' flag. That flag previously was not included in
compact commit records. To prevent an increase in size due to its
presence, it's only included if necessary; signalled by a bit in the
xl_info bits available for xact.c, similar to heapam.c's
XLOG_HEAP_OPMASK/XLOG_HEAP_INIT_PAGE.
Twophase commit/aborts are now the same as their normal
counterparts. The original transaction's xid is included in an optional
data field.
This means that commit records generally are smaller, except in the case
of a transaction with subtransactions, but no other special cases; the
increase there is four bytes, which seems acceptable given that the more
common case of not having subtransactions shrank. The savings are
especially measurable for twophase commits, which previously always used
the full version; but will in practice only infrequently have required
that.
The motivation for this work are not the space savings and and
deduplication though; it's that it makes it easier to extend commit
records with additional information. That's just a few lines of code
now; without impacting the common case where that information is not
needed.
Discussion: 20150220152150.GD4149@awork2.anarazel.de,
235610.92468.qm%40web29004.mail.ird.yahoo.com
Reviewed-By: Heikki Linnakangas, Simon Riggs
2015-03-15 17:37:07 +01:00
|
|
|
xid);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
/* Always flush, since we're about to remove the 2PC state file */
|
|
|
|
XLogFlush(recptr);
|
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Mark the transaction aborted in clog. This is not absolutely necessary
|
|
|
|
* but we may as well do it while we are here.
|
2005-06-18 00:32:51 +02:00
|
|
|
*/
|
2008-10-20 21:18:18 +02:00
|
|
|
TransactionIdAbortTree(xid, nchildren, children);
|
2005-06-18 00:32:51 +02:00
|
|
|
|
|
|
|
END_CRIT_SECTION();
|
2011-03-06 23:49:16 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait for synchronous replication, if required.
|
|
|
|
*
|
2011-04-10 17:42:00 +02:00
|
|
|
* Note that at this stage we have marked clog, but still show as running
|
|
|
|
* in the procarray and continue to hold locks.
|
2011-03-06 23:49:16 +01:00
|
|
|
*/
|
2016-03-30 03:16:12 +02:00
|
|
|
SyncRepWaitForLSN(recptr, false);
|
2005-06-18 00:32:51 +02:00
|
|
|
}
|
2017-04-04 21:56:56 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* PrepareRedoAdd
|
|
|
|
*
|
|
|
|
* Store pointers to the start/end of the WAL record along with the xid in
|
|
|
|
* a gxact entry in shared memory TwoPhaseState structure. If caller
|
|
|
|
* specifies InvalidXLogRecPtr as WAL position to fetch the two-phase
|
|
|
|
* data, the entry is marked as located on disk.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn)
|
|
|
|
{
|
|
|
|
TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf;
|
|
|
|
char *bufptr;
|
|
|
|
const char *gid;
|
|
|
|
GlobalTransaction gxact;
|
|
|
|
|
|
|
|
Assert(RecoveryInProgress());
|
|
|
|
|
|
|
|
bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
|
|
|
|
gid = (const char *) bufptr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reserve the GID for the given transaction in the redo code path.
|
|
|
|
*
|
|
|
|
* This creates a gxact struct and puts it into the active array.
|
|
|
|
*
|
|
|
|
* In redo, this struct is mainly used to track PREPARE/COMMIT entries
|
|
|
|
* in shared memory. Hence, we only fill up the bare minimum contents here.
|
|
|
|
* The gxact also gets marked with gxact->inredo set to true to indicate
|
|
|
|
* that it got added in the redo phase
|
|
|
|
*/
|
|
|
|
|
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
|
|
|
|
/* Get a free gxact from the freelist */
|
|
|
|
if (TwoPhaseState->freeGXacts == NULL)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OUT_OF_MEMORY),
|
|
|
|
errmsg("maximum number of prepared transactions reached"),
|
|
|
|
errhint("Increase max_prepared_transactions (currently %d).",
|
|
|
|
max_prepared_xacts)));
|
|
|
|
gxact = TwoPhaseState->freeGXacts;
|
|
|
|
TwoPhaseState->freeGXacts = gxact->next;
|
|
|
|
|
|
|
|
gxact->prepared_at = hdr->prepared_at;
|
|
|
|
gxact->prepare_start_lsn = start_lsn;
|
|
|
|
gxact->prepare_end_lsn = end_lsn;
|
|
|
|
gxact->xid = hdr->xid;
|
|
|
|
gxact->owner = hdr->owner;
|
|
|
|
gxact->locking_backend = InvalidBackendId;
|
|
|
|
gxact->valid = false;
|
|
|
|
gxact->ondisk = XLogRecPtrIsInvalid(start_lsn);
|
|
|
|
gxact->inredo = true; /* yes, added in redo */
|
|
|
|
strcpy(gxact->gid, gid);
|
|
|
|
|
|
|
|
/* And insert it into the active array */
|
|
|
|
Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts);
|
|
|
|
TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact;
|
|
|
|
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
elog(DEBUG2, "Adding 2PC data to shared memory %u", gxact->xid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PrepareRedoRemove
|
|
|
|
*
|
|
|
|
* Remove the corresponding gxact entry from TwoPhaseState. Also
|
|
|
|
* remove the 2PC file if a prepared transaction was saved via
|
|
|
|
* an earlier checkpoint.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
PrepareRedoRemove(TransactionId xid, bool giveWarning)
|
|
|
|
{
|
|
|
|
GlobalTransaction gxact = NULL;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
Assert(RecoveryInProgress());
|
|
|
|
|
|
|
|
LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
|
|
|
|
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
|
|
|
|
{
|
|
|
|
gxact = TwoPhaseState->prepXacts[i];
|
|
|
|
|
|
|
|
if (gxact->xid == xid)
|
|
|
|
{
|
|
|
|
Assert(gxact->inredo);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
LWLockRelease(TwoPhaseStateLock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Just leave if there is nothing, this is expected during WAL replay.
|
|
|
|
*/
|
|
|
|
if (gxact == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* And now we can clean up any files we may have left.
|
|
|
|
*/
|
|
|
|
elog(DEBUG2, "Removing 2PC data from shared memory %u", xid);
|
|
|
|
if (gxact->ondisk)
|
|
|
|
RemoveTwoPhaseFile(xid, giveWarning);
|
|
|
|
RemoveGXact(gxact);
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|