postgresql/src/backend/utils/adt/xid.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

377 lines
8.3 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* xid.c
* POSTGRES transaction identifier and command identifier datatypes.
*
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/adt/xid.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <limits.h>
#include "access/multixact.h"
#include "access/transam.h"
#include "access/xact.h"
#include "common/int.h"
#include "libpq/pqformat.h"
#include "utils/builtins.h"
#include "utils/xid8.h"
/* Fetch function argument number n and convert it from Datum to CommandId. */
#define PG_GETARG_COMMANDID(n) DatumGetCommandId(PG_GETARG_DATUM(n))
/*
 * Return x as a CommandId Datum.  Note that this expands to a complete
 * "return" statement, so it ends the calling function.
 */
#define PG_RETURN_COMMANDID(x) return CommandIdGetDatum(x)
/*
 * xidin - converts external text representation to xid
 *
 * The input C string is handed to uint32in_subr() together with the type
 * name "xid" and the call's fcinfo->context; the value that helper returns
 * is the resulting TransactionId.
 */
Datum
xidin(PG_FUNCTION_ARGS)
{
	char	   *input = PG_GETARG_CSTRING(0);
	TransactionId xid = uint32in_subr(input, NULL, "xid", fcinfo->context);

	PG_RETURN_TRANSACTIONID(xid);
}
/*
 * xidout - converts an xid to its external text representation
 *
 * Returns a palloc'd C string holding the value printed as an unsigned
 * decimal number.
 */
Datum
xidout(PG_FUNCTION_ARGS)
{
	TransactionId xid = PG_GETARG_TRANSACTIONID(0);
	char	   *out;

	/*
	 * 16 bytes of output space.  Presumably TransactionId is 32 bits wide
	 * (the uint32/int32 helpers used elsewhere in this file suggest so), in
	 * which case at most 10 digits plus the terminating NUL are needed.
	 */
	out = (char *) palloc(16);
	snprintf(out, 16, "%lu", (unsigned long) xid);

	PG_RETURN_CSTRING(out);
}
/*
* xidrecv - converts external binary format to xid
*/
Datum
xidrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
PG_RETURN_TRANSACTIONID((TransactionId) pq_getmsgint(buf, sizeof(TransactionId)));
}
/*
 * xidsend - converts xid to binary format
 *
 * Builds a bytea result containing the xid written with pq_sendint32().
 */
Datum
xidsend(PG_FUNCTION_ARGS)
{
	TransactionId xid = PG_GETARG_TRANSACTIONID(0);
	StringInfoData out;

	pq_begintypsend(&out);
	pq_sendint32(&out, xid);

	PG_RETURN_BYTEA_P(pq_endtypsend(&out));
}
/*
 * xideq - are two xids equal?
 *
 * Returns the result of TransactionIdEquals() on the two arguments.
 */
Datum
xideq(PG_FUNCTION_ARGS)
{
	TransactionId lhs = PG_GETARG_TRANSACTIONID(0);
	TransactionId rhs = PG_GETARG_TRANSACTIONID(1);

	PG_RETURN_BOOL(TransactionIdEquals(lhs, rhs));
}
/*
 * xidneq - are two xids different?
 *
 * Returns the negation of TransactionIdEquals() on the two arguments.
 */
Datum
xidneq(PG_FUNCTION_ARGS)
{
	TransactionId lhs = PG_GETARG_TRANSACTIONID(0);
	TransactionId rhs = PG_GETARG_TRANSACTIONID(1);

	PG_RETURN_BOOL(!TransactionIdEquals(lhs, rhs));
}
/*
* xid_age - compute age of an XID (relative to latest stable xid)
*/
Datum
xid_age(PG_FUNCTION_ARGS)
{
TransactionId xid = PG_GETARG_TRANSACTIONID(0);
TransactionId now = GetStableLatestTransactionId();
/* Permanent XIDs are always infinitely old */
if (!TransactionIdIsNormal(xid))
PG_RETURN_INT32(INT_MAX);
PG_RETURN_INT32((int32) (now - xid));
}
/*
* mxid_age - compute age of a multi XID (relative to latest stable mxid)
*/
Datum
mxid_age(PG_FUNCTION_ARGS)
{
TransactionId xid = PG_GETARG_TRANSACTIONID(0);
MultiXactId now = ReadNextMultiXactId();
if (!MultiXactIdIsValid(xid))
PG_RETURN_INT32(INT_MAX);
PG_RETURN_INT32((int32) (now - xid));
}
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
 * xidComparator
 *		qsort comparison function for XIDs
 *
 * We can't use wraparound comparison for XIDs because that does not respect
 * the triangle inequality!  Any old sort order will do, so the two values
 * are simply compared with pg_cmp_u32().
 *
 * arg1 and arg2 point to the TransactionId values to compare; the return
 * value is whatever pg_cmp_u32() yields for them.
 */
int
xidComparator(const void *arg1, const void *arg2)
{
	TransactionId xid1 = *(const TransactionId *) arg1;
	TransactionId xid2 = *(const TransactionId *) arg2;

	return pg_cmp_u32(xid1, xid2);
}
Fix ordering of XIDs in ProcArrayApplyRecoveryInfo Commit 8431e296ea reworked ProcArrayApplyRecoveryInfo to sort XIDs before adding them to KnownAssignedXids. But the XIDs are sorted using xidComparator, which compares the XIDs simply as uint32 values, not logically. KnownAssignedXidsAdd() however expects XIDs in logical order, and calls TransactionIdFollowsOrEquals() to enforce that. If there are XIDs for which the two orderings disagree, an error is raised and the recovery fails/restarts. Hitting this issue is fairly easy - you just need two transactions, one started before the 4B limit (e.g. XID 4294967290), the other sometime after it (e.g. XID 1000). Logically (4294967290 <= 1000) but when compared using xidComparator we try to add them in the opposite order. Which makes KnownAssignedXidsAdd() fail with an error like this: ERROR: out-of-order XID insertion in KnownAssignedXids This only happens during replica startup, while processing RUNNING_XACTS records to build the snapshot. Once we reach STANDBY_SNAPSHOT_READY, we skip these records. So this does not affect already running replicas, but if you restart (or create) a replica while there are transactions with XIDs for which the two orderings disagree, you may hit this. Long-running transactions and frequent replica restarts increase the likelihood of hitting this issue. Once the replica gets into this state, it can't be started (even if the old transactions are terminated). Fixed by sorting the XIDs logically - this is fine because we're dealing with normal XIDs (because it's XIDs assigned to backends) and from the same wraparound epoch (otherwise the backends could not be running at the same time on the primary node). So there are no problems with the triangle inequality, which is why xidComparator compares raw values. Investigation and root cause analysis by Abhijit Menon-Sen. Patch by me. This issue is present in all releases since 9.4, however releases up to 9.6 are EOL already so backpatch to 10 only. 
Reviewed-by: Abhijit Menon-Sen Reviewed-by: Alvaro Herrera Backpatch-through: 10 Discussion: https://postgr.es/m/36b8a501-5d73-277c-4972-f58a4dce088a%40enterprisedb.com
2022-01-27 17:53:53 +01:00
/*
 * xidLogicalComparator
 *		qsort comparison function for XIDs
 *
 * Intended only for XIDs belonging to the same epoch (e.g. those of
 * backends running at the same time).  All inputs must therefore be normal
 * XIDs, which means the triangle inequality is not a concern here.
 *
 * Returns -1 if the first XID precedes the second, 1 if the second precedes
 * the first, and 0 otherwise.
 */
int
xidLogicalComparator(const void *arg1, const void *arg2)
{
	TransactionId lhs = *(const TransactionId *) arg1;
	TransactionId rhs = *(const TransactionId *) arg2;
	int			cmp = 0;

	Assert(TransactionIdIsNormal(lhs));
	Assert(TransactionIdIsNormal(rhs));

	if (TransactionIdPrecedes(lhs, rhs))
		cmp = -1;
	else if (TransactionIdPrecedes(rhs, lhs))
		cmp = 1;

	return cmp;
}
/*
 * xid8toxid - convert an xid8 to an xid
 *
 * Returns the result of XidFromFullTransactionId() applied to the argument.
 */
Datum
xid8toxid(PG_FUNCTION_ARGS)
{
	FullTransactionId full = PG_GETARG_FULLTRANSACTIONID(0);

	PG_RETURN_TRANSACTIONID(XidFromFullTransactionId(full));
}
/*
 * xid8in - converts external text representation to xid8
 *
 * The input C string is parsed by uint64in_subr(), which is given the type
 * name "xid8" and the call's fcinfo->context; the 64-bit result is wrapped
 * into a FullTransactionId.
 */
Datum
xid8in(PG_FUNCTION_ARGS)
{
	char	   *input = PG_GETARG_CSTRING(0);
	uint64		raw = uint64in_subr(input, NULL, "xid8", fcinfo->context);

	PG_RETURN_FULLTRANSACTIONID(FullTransactionIdFromU64(raw));
}
/*
 * xid8out - converts an xid8 to its external text representation
 *
 * Returns a palloc'd C string holding the 64-bit value formatted with
 * UINT64_FORMAT.
 */
Datum
xid8out(PG_FUNCTION_ARGS)
{
	FullTransactionId full = PG_GETARG_FULLTRANSACTIONID(0);
	char	   *out;

	/* 21 bytes: up to 20 decimal digits for a 64-bit value, plus the NUL */
	out = (char *) palloc(21);
	snprintf(out, 21, UINT64_FORMAT, U64FromFullTransactionId(full));

	PG_RETURN_CSTRING(out);
}
Datum
xid8recv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
uint64 value;
value = (uint64) pq_getmsgint64(buf);
PG_RETURN_FULLTRANSACTIONID(FullTransactionIdFromU64(value));
}
/*
 * xid8send - converts xid8 to binary format
 *
 * Builds a bytea result containing the 64-bit value written with
 * pq_sendint64().
 */
Datum
xid8send(PG_FUNCTION_ARGS)
{
	FullTransactionId full = PG_GETARG_FULLTRANSACTIONID(0);
	StringInfoData out;

	pq_begintypsend(&out);
	pq_sendint64(&out, (uint64) U64FromFullTransactionId(full));

	PG_RETURN_BYTEA_P(pq_endtypsend(&out));
}
/*
 * xid8eq - are two xid8 values equal?
 *
 * Returns the result of FullTransactionIdEquals() on the two arguments.
 */
Datum
xid8eq(PG_FUNCTION_ARGS)
{
	FullTransactionId lhs = PG_GETARG_FULLTRANSACTIONID(0);
	FullTransactionId rhs = PG_GETARG_FULLTRANSACTIONID(1);

	PG_RETURN_BOOL(FullTransactionIdEquals(lhs, rhs));
}
/*
 * xid8ne - are two xid8 values different?
 *
 * Returns the negation of FullTransactionIdEquals() on the two arguments.
 */
Datum
xid8ne(PG_FUNCTION_ARGS)
{
	FullTransactionId lhs = PG_GETARG_FULLTRANSACTIONID(0);
	FullTransactionId rhs = PG_GETARG_FULLTRANSACTIONID(1);

	PG_RETURN_BOOL(!FullTransactionIdEquals(lhs, rhs));
}
/*
 * xid8lt - "less than" for xid8
 *
 * Returns the result of FullTransactionIdPrecedes(first, second).
 */
Datum
xid8lt(PG_FUNCTION_ARGS)
{
	FullTransactionId lhs = PG_GETARG_FULLTRANSACTIONID(0);
	FullTransactionId rhs = PG_GETARG_FULLTRANSACTIONID(1);

	PG_RETURN_BOOL(FullTransactionIdPrecedes(lhs, rhs));
}
/*
 * xid8gt - "greater than" for xid8
 *
 * Returns the result of FullTransactionIdFollows(first, second).
 */
Datum
xid8gt(PG_FUNCTION_ARGS)
{
	FullTransactionId lhs = PG_GETARG_FULLTRANSACTIONID(0);
	FullTransactionId rhs = PG_GETARG_FULLTRANSACTIONID(1);

	PG_RETURN_BOOL(FullTransactionIdFollows(lhs, rhs));
}
/*
 * xid8le - "less than or equal" for xid8
 *
 * Returns the result of FullTransactionIdPrecedesOrEquals(first, second).
 */
Datum
xid8le(PG_FUNCTION_ARGS)
{
	FullTransactionId lhs = PG_GETARG_FULLTRANSACTIONID(0);
	FullTransactionId rhs = PG_GETARG_FULLTRANSACTIONID(1);

	PG_RETURN_BOOL(FullTransactionIdPrecedesOrEquals(lhs, rhs));
}
/*
 * xid8ge - "greater than or equal" for xid8
 *
 * Returns the result of FullTransactionIdFollowsOrEquals(first, second).
 */
Datum
xid8ge(PG_FUNCTION_ARGS)
{
	FullTransactionId lhs = PG_GETARG_FULLTRANSACTIONID(0);
	FullTransactionId rhs = PG_GETARG_FULLTRANSACTIONID(1);

	PG_RETURN_BOOL(FullTransactionIdFollowsOrEquals(lhs, rhs));
}
/*
 * xid8cmp - three-way comparison for xid8
 *
 * Returns 1 if the first argument follows the second, 0 if the two are
 * equal, and -1 otherwise.
 */
Datum
xid8cmp(PG_FUNCTION_ARGS)
{
	FullTransactionId lhs = PG_GETARG_FULLTRANSACTIONID(0);
	FullTransactionId rhs = PG_GETARG_FULLTRANSACTIONID(1);
	int			cmp = -1;

	if (FullTransactionIdFollows(lhs, rhs))
		cmp = 1;
	else if (FullTransactionIdEquals(lhs, rhs))
		cmp = 0;

	PG_RETURN_INT32(cmp);
}
/*
 * xid8_larger - return the later of two xid8 values
 *
 * The first argument is returned only when it follows the second; in every
 * other case the second argument is returned.
 */
Datum
xid8_larger(PG_FUNCTION_ARGS)
{
	FullTransactionId lhs = PG_GETARG_FULLTRANSACTIONID(0);
	FullTransactionId rhs = PG_GETARG_FULLTRANSACTIONID(1);

	PG_RETURN_FULLTRANSACTIONID(FullTransactionIdFollows(lhs, rhs) ? lhs : rhs);
}
/*
 * xid8_smaller - return the earlier of two xid8 values
 *
 * The first argument is returned only when it precedes the second; in every
 * other case the second argument is returned.
 */
Datum
xid8_smaller(PG_FUNCTION_ARGS)
{
	FullTransactionId lhs = PG_GETARG_FULLTRANSACTIONID(0);
	FullTransactionId rhs = PG_GETARG_FULLTRANSACTIONID(1);

	PG_RETURN_FULLTRANSACTIONID(FullTransactionIdPrecedes(lhs, rhs) ? lhs : rhs);
}
/*****************************************************************************
* COMMAND IDENTIFIER ROUTINES *
*****************************************************************************/
/*
 * cidin - converts external text representation to a CommandId
 *
 * The input C string is parsed by uint32in_subr(), which is given the type
 * name "cid" and the call's fcinfo->context.
 */
Datum
cidin(PG_FUNCTION_ARGS)
{
	char	   *input = PG_GETARG_CSTRING(0);
	CommandId	cid = uint32in_subr(input, NULL, "cid", fcinfo->context);

	PG_RETURN_COMMANDID(cid);
}
/*
 * cidout - converts a cid to external representation
 *
 * Returns a palloc'd C string holding the value printed as an unsigned
 * decimal number.
 */
Datum
cidout(PG_FUNCTION_ARGS)
{
	CommandId	cid = PG_GETARG_COMMANDID(0);
	char	   *out;

	/*
	 * 16 bytes of output space.  Presumably CommandId is 32 bits wide (it is
	 * parsed and sent with uint32/int32 helpers in this file), in which case
	 * at most 10 digits plus the terminating NUL are needed.
	 */
	out = (char *) palloc(16);
	snprintf(out, 16, "%lu", (unsigned long) cid);

	PG_RETURN_CSTRING(out);
}
/*
* cidrecv - converts external binary format to cid
*/
Datum
cidrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
PG_RETURN_COMMANDID((CommandId) pq_getmsgint(buf, sizeof(CommandId)));
}
/*
 * cidsend - converts cid to binary format
 *
 * Builds a bytea result containing the cid written with pq_sendint32().
 */
Datum
cidsend(PG_FUNCTION_ARGS)
{
	CommandId	cid = PG_GETARG_COMMANDID(0);
	StringInfoData out;

	pq_begintypsend(&out);
	pq_sendint32(&out, cid);

	PG_RETURN_BYTEA_P(pq_endtypsend(&out));
}
/*
 * cideq - are two cids equal?
 *
 * Compares the two CommandId arguments with a plain == test.
 */
Datum
cideq(PG_FUNCTION_ARGS)
{
	CommandId	lhs = PG_GETARG_COMMANDID(0);
	CommandId	rhs = PG_GETARG_COMMANDID(1);

	PG_RETURN_BOOL(lhs == rhs);
}