2012-08-29 01:02:00 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* heapam_xlog.h
|
|
|
|
* POSTGRES heap access XLOG definitions.
|
|
|
|
*
|
|
|
|
*
|
2014-01-07 22:05:30 +01:00
|
|
|
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
|
2012-08-29 01:02:00 +02:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
|
|
*
|
|
|
|
* src/include/access/heapam_xlog.h
|
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#ifndef HEAPAM_XLOG_H
|
|
|
|
#define HEAPAM_XLOG_H
|
|
|
|
|
|
|
|
#include "access/htup.h"
|
|
|
|
#include "access/xlog.h"
|
2012-08-30 22:15:44 +02:00
|
|
|
#include "storage/bufpage.h"
|
2012-08-29 01:02:00 +02:00
|
|
|
#include "storage/relfilenode.h"
|
|
|
|
#include "utils/relcache.h"
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* WAL record definitions for heapam.c's WAL operations
|
|
|
|
*
|
|
|
|
* XLOG allows us to store some information in the high 4 bits of the log
|
|
|
|
* record xl_info field. We use 3 for opcode and one for init bit.
|
|
|
|
*/
|
|
|
|
#define XLOG_HEAP_INSERT 0x00
|
|
|
|
#define XLOG_HEAP_DELETE 0x10
|
|
|
|
#define XLOG_HEAP_UPDATE 0x20
|
|
|
|
/* 0x30 is free, was XLOG_HEAP_MOVE */
|
|
|
|
#define XLOG_HEAP_HOT_UPDATE 0x40
|
|
|
|
#define XLOG_HEAP_NEWPAGE 0x50
|
|
|
|
#define XLOG_HEAP_LOCK 0x60
|
|
|
|
#define XLOG_HEAP_INPLACE 0x70
|
|
|
|
|
|
|
|
/* Mask for the opcode bits of xl_info (covers 0x00-0x70; leaves out the 0x80 init bit). */
#define XLOG_HEAP_OPMASK 0x70
|
|
|
|
/*
|
|
|
|
* When we insert the 1st item on a new page in INSERT, UPDATE, HOT_UPDATE,
|
|
|
|
* or MULTI_INSERT, we can (and we do) restore the entire page in redo
|
|
|
|
*/
|
|
|
|
#define XLOG_HEAP_INIT_PAGE 0x80
|
|
|
|
/*
|
|
|
|
* We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes
|
|
|
|
* are associated with RM_HEAP2_ID, but are not logically different from
|
|
|
|
* the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to
|
|
|
|
* these, too.
|
|
|
|
*/
|
Introduce logical decoding.
This feature, building on previous commits, allows the write-ahead log
stream to be decoded into a series of logical changes; that is,
inserts, updates, and deletes and the transactions which contain them.
It is capable of handling decoding even across changes to the schema
of the affected tables. The output format is controlled by a
so-called "output plugin"; an example is included. To make use of
this in a real replication system, the output plugin will need to be
modified to produce output in the format appropriate to that system,
and to perform filtering.
Currently, information can be extracted from the logical decoding
system only via SQL; future commits will add the ability to stream
changes via walsender.
Andres Freund, with review and other contributions from many other
people, including Álvaro Herrera, Abhijit Menon-Sen, Peter Geoghegan,
Kevin Grittner, Robert Haas, Heikki Linnakangas, Fujii Masao, Abhijit
Menon-Sen, Michael Paquier, Simon Riggs, Craig Ringer, and Steve
Singer.
2014-03-03 22:32:18 +01:00
|
|
|
#define XLOG_HEAP2_REWRITE 0x00
|
2012-08-29 01:02:00 +02:00
|
|
|
#define XLOG_HEAP2_CLEAN 0x10
|
Rework tuple freezing protocol
Tuple freezing was broken in connection to MultiXactIds; commit
8e53ae025de9 tried to fix it, but didn't go far enough. As noted by
Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted
update might cause locks in the multi to go ignored by later
transactions. This is because the code depended on a multixact above
their cutoff point not having any lock-only member older than the cutoff
point for Xids, which is easily defeated in READ COMMITTED transactions.
The fix for this involves creating a new MultiXactId when necessary.
But this cannot be done during WAL replay, and moreover multixact
examination requires using CLOG access routines which are not supposed
to be used during WAL replay either; so tuple freezing cannot be done
with the old freeze WAL record. Therefore, separate the freezing
computation from its execution, and change the WAL record to carry all
necessary information. At WAL replay time, it's easy to re-execute
freezing because we don't need to re-compute the new infomask/Xmax
values but just take them from the WAL record.
While at it, restructure the coding to ensure all page changes occur in
a single critical section without much room for failures. The previous
coding wasn't using a critical section, without any explanation as to
why this was acceptable.
In replication scenarios using the 9.3 branch, standby servers must be
upgraded before their master, so that they are prepared to deal with the
new WAL record once the master is upgraded; failure to do so will cause
WAL replay to die with a PANIC message. Later upgrade of the standby
will allow the process to continue where it left off, so there's no
disruption of the data in the standby in any case. Standbys know how to
deal with the old WAL record, so it's okay to keep the master running
the old code for a while.
In master, the old freeze WAL record is gone, for cleanliness' sake;
there's no compatibility concern there.
Backpatch to 9.3, where the original bug was introduced and where the
previous fix was backpatched.
Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
|
|
|
#define XLOG_HEAP2_FREEZE_PAGE 0x20
|
2012-08-29 01:02:00 +02:00
|
|
|
#define XLOG_HEAP2_CLEANUP_INFO 0x30
|
|
|
|
#define XLOG_HEAP2_VISIBLE 0x40
|
|
|
|
#define XLOG_HEAP2_MULTI_INSERT 0x50
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This causes additional WAL-logging, also (there was previously a single
WAL record for a locked tuple; now there are as many as there are updated
copies of the tuple.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
|
|
|
#define XLOG_HEAP2_LOCK_UPDATED 0x60
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible for
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
#define XLOG_HEAP2_NEW_CID 0x70
|
|
|
|
|
|
|
|
/*
|
|
|
|
* xl_heap_* ->flags values; 8 bits are available.
|
|
|
|
*/
|
|
|
|
/* PD_ALL_VISIBLE was cleared */
|
|
|
|
#define XLOG_HEAP_ALL_VISIBLE_CLEARED (1<<0)
|
|
|
|
/* PD_ALL_VISIBLE was cleared in the 2nd page */
|
|
|
|
#define XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED (1<<1)
|
|
|
|
#define XLOG_HEAP_CONTAINS_OLD_TUPLE (1<<2)
|
|
|
|
#define XLOG_HEAP_CONTAINS_OLD_KEY (1<<3)
|
|
|
|
#define XLOG_HEAP_CONTAINS_NEW_TUPLE (1<<4)
|
2014-03-12 21:46:04 +01:00
|
|
|
#define XLOG_HEAP_PREFIX_FROM_OLD (1<<5)
|
|
|
|
#define XLOG_HEAP_SUFFIX_FROM_OLD (1<<6)
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible for
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
|
|
|
|
/* convenience macro for checking whether any form of old tuple was logged */
|
|
|
|
#define XLOG_HEAP_CONTAINS_OLD \
|
|
|
|
(XLOG_HEAP_CONTAINS_OLD_TUPLE | XLOG_HEAP_CONTAINS_OLD_KEY)
|
2012-08-29 01:02:00 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* All that we need to find the changed tuple
|
|
|
|
*
|
|
|
|
* NB: on most machines, sizeof(xl_heaptid) will include some trailing pad
|
|
|
|
* bytes for alignment. We don't want to store the pad space in the XLOG,
|
|
|
|
* so use SizeOfHeapTid for space calculations. Similar comments apply for
|
|
|
|
* the other xl_FOO structs.
|
|
|
|
*/
|
|
|
|
/*
 * Locator for a changed tuple: a RelFileNode plus the tuple's item pointer.
 * Only SizeOfHeapTid bytes of this struct are meant to go into WAL, so that
 * trailing alignment pad bytes are not stored.
 */
typedef struct xl_heaptid
|
|
|
|
{
|
|
|
|
RelFileNode node; /* NOTE(review): presumably the relation holding the tuple -- confirm */
|
|
|
|
ItemPointerData tid; /* changed tuple id */
|
|
|
|
} xl_heaptid;
|
|
|
|
|
|
|
|
#define SizeOfHeapTid (offsetof(xl_heaptid, tid) + SizeOfIptrData)
|
|
|
|
|
|
|
|
/* This is what we need to know about delete */
|
|
|
|
typedef struct xl_heap_delete
|
|
|
|
{
|
|
|
|
xl_heaptid target; /* deleted tuple id */
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This causes additional WAL-logging, also (there was previously a single
WAL record for a locked tuple; now there are as many as there are updated
copies of the tuple.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
|
|
|
TransactionId xmax; /* xmax of the deleted tuple */
|
|
|
|
uint8 infobits_set; /* infomask bits */
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible for
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
uint8 flags;
|
2012-08-29 01:02:00 +02:00
|
|
|
} xl_heap_delete;
|
|
|
|
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible for
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
#define SizeOfHeapDelete (offsetof(xl_heap_delete, flags) + sizeof(uint8))
|
2012-08-29 01:02:00 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't store the whole fixed part (HeapTupleHeaderData) of an inserted
|
|
|
|
* or updated tuple in WAL; we can save a few bytes by reconstructing the
|
|
|
|
* fields that are available elsewhere in the WAL record, or perhaps just
|
|
|
|
* plain needn't be reconstructed. These are the fields we must store.
|
|
|
|
* NOTE: t_hoff could be recomputed, but we may as well store it because
|
|
|
|
* it will come for free due to alignment considerations.
|
|
|
|
*/
|
|
|
|
typedef struct xl_heap_header
|
|
|
|
{
|
|
|
|
uint16 t_infomask2; /* tuple header field that must be stored in WAL */
|
|
|
|
uint16 t_infomask; /* tuple header field that must be stored in WAL */
|
|
|
|
uint8 t_hoff; /* could be recomputed, but comes for free due to alignment */
|
|
|
|
} xl_heap_header;
|
|
|
|
|
|
|
|
#define SizeOfHeapHeader (offsetof(xl_heap_header, t_hoff) + sizeof(uint8))
|
|
|
|
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible for
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
/*
|
|
|
|
* Variant of xl_heap_header that contains the length of the tuple, which is
|
|
|
|
* useful if the length of the tuple cannot be computed using the overall
|
|
|
|
* record length. E.g. because there are several tuples inside a single
|
|
|
|
* record.
|
|
|
|
*/
|
|
|
|
typedef struct xl_heap_header_len
|
|
|
|
{
|
|
|
|
uint16 t_len; /* length of the tuple */
|
|
|
|
xl_heap_header header; /* the regular xl_heap_header fields */
|
|
|
|
} xl_heap_header_len;
|
|
|
|
|
|
|
|
#define SizeOfHeapHeaderLen (offsetof(xl_heap_header_len, header) + SizeOfHeapHeader)
|
|
|
|
|
2012-08-29 01:02:00 +02:00
|
|
|
/* This is what we need to know about insert */
|
|
|
|
typedef struct xl_heap_insert
|
|
|
|
{
|
|
|
|
xl_heaptid target; /* inserted tuple id */
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible for
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
uint8 flags;
|
2012-08-29 01:02:00 +02:00
|
|
|
/* xl_heap_header & TUPLE DATA FOLLOWS AT END OF STRUCT */
|
|
|
|
} xl_heap_insert;
|
|
|
|
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible for
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
#define SizeOfHeapInsert (offsetof(xl_heap_insert, flags) + sizeof(uint8))
|
2012-08-29 01:02:00 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is what we need to know about a multi-insert. The record consists of
|
|
|
|
* xl_heap_multi_insert header, followed by an xl_multi_insert_tuple and tuple
|
|
|
|
* data for each tuple. 'offsets' array is omitted if the whole page is
|
|
|
|
* reinitialized (XLOG_HEAP_INIT_PAGE)
|
|
|
|
*/
|
|
|
|
typedef struct xl_heap_multi_insert
|
|
|
|
{
|
|
|
|
RelFileNode node;
|
|
|
|
BlockNumber blkno;
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible for
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
uint8 flags;
|
2012-08-29 01:02:00 +02:00
|
|
|
uint16 ntuples;
|
|
|
|
OffsetNumber offsets[1];
|
|
|
|
|
|
|
|
/* TUPLE DATA (xl_multi_insert_tuples) FOLLOW AT END OF STRUCT */
|
|
|
|
} xl_heap_multi_insert;
|
|
|
|
|
|
|
|
#define SizeOfHeapMultiInsert offsetof(xl_heap_multi_insert, offsets)
|
|
|
|
|
|
|
|
/*
 * Per-tuple header inside an xl_heap_multi_insert record.  It carries the
 * same three fields as xl_heap_header (t_infomask2, t_infomask, t_hoff),
 * preceded by the length of the tuple data that follows.  Use
 * SizeOfMultiInsertTuple rather than sizeof() for the stored size.
 */
typedef struct xl_multi_insert_tuple
|
|
|
|
{
|
|
|
|
uint16 datalen; /* size of tuple data that follows */
|
|
|
|
uint16 t_infomask2;
|
|
|
|
uint16 t_infomask;
|
|
|
|
uint8 t_hoff;
|
|
|
|
/* TUPLE DATA FOLLOWS AT END OF STRUCT */
|
|
|
|
} xl_multi_insert_tuple;
|
|
|
|
|
|
|
|
/*
 * Bytes of xl_multi_insert_tuple up to and including t_hoff; computed from
 * the last member so that any trailing struct padding is not counted.
 */
#define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8))
|
|
|
|
|
|
|
|
/* This is what we need to know about update|hot_update */
|
|
|
|
typedef struct xl_heap_update
|
|
|
|
{
|
|
|
|
xl_heaptid target; /* deleted tuple id */
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This causes additional WAL-logging, also (there was previously a single
WAL record for a locked tuple; now there are as many as updated copies
of the tuple there exist.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
|
|
|
TransactionId old_xmax; /* xmax of the old tuple */
|
|
|
|
TransactionId new_xmax; /* xmax of the new tuple */
|
2012-08-29 01:02:00 +02:00
|
|
|
ItemPointerData newtid; /* new inserted tuple id */
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values of stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
uint8 old_infobits_set; /* infomask bits to set on old tuple */
|
|
|
|
uint8 flags;
|
2014-03-12 21:46:04 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If XLOG_HEAP_PREFIX_FROM_OLD or XLOG_HEAP_SUFFIX_FROM_OLD flags are
|
|
|
|
* set, the prefix and/or suffix come next, as one or two uint16s.
|
|
|
|
*
|
|
|
|
* After that, xl_heap_header_len and new tuple data follow. The new
|
|
|
|
* tuple data and length don't include the prefix and suffix, which are
|
|
|
|
* copied from the old tuple on replay. The new tuple data is omitted if
|
|
|
|
* a full-page image of the page was taken (unless the
|
|
|
|
* XLOG_HEAP_CONTAINS_NEW_TUPLE flag is set, in which case it's included
|
|
|
|
* anyway).
|
|
|
|
*
|
|
|
|
* If XLOG_HEAP_CONTAINS_OLD_TUPLE or XLOG_HEAP_CONTAINS_OLD_KEY flags are
|
|
|
|
* set, another xl_heap_header_len struct and tuple data for the old tuple
|
|
|
|
* follows.
|
|
|
|
*/
|
2012-08-29 01:02:00 +02:00
|
|
|
} xl_heap_update;
|
|
|
|
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible
for a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
/*
 * Bytes of xl_heap_update up to and including 'flags'; computed from the
 * last member so that any trailing struct padding is not counted.
 */
#define SizeOfHeapUpdate (offsetof(xl_heap_update, flags) + sizeof(uint8))
|
2012-08-29 01:02:00 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is what we need to know about vacuum page cleanup/redirect
|
|
|
|
*
|
|
|
|
* The array of OffsetNumbers following the fixed part of the record contains:
|
|
|
|
* * for each redirected item: the item offset, then the offset redirected to
|
|
|
|
* * for each now-dead item: the item offset
|
|
|
|
* * for each now-unused item: the item offset
|
|
|
|
* The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused.
|
|
|
|
* Note that nunused is not explicitly stored, but may be found by reference
|
|
|
|
* to the total record length.
|
|
|
|
*/
|
|
|
|
typedef struct xl_heap_clean
|
|
|
|
{
|
|
|
|
RelFileNode node;
|
|
|
|
BlockNumber block;
|
|
|
|
TransactionId latestRemovedXid;
|
|
|
|
uint16 nredirected;
|
|
|
|
uint16 ndead;
|
|
|
|
/* OFFSET NUMBERS FOLLOW */
|
|
|
|
} xl_heap_clean;
|
|
|
|
|
|
|
|
/*
 * Bytes of xl_heap_clean up to and including 'ndead'; computed from the last
 * member so that any trailing struct padding is not counted.
 */
#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cleanup_info is required in some cases during a lazy VACUUM.
|
|
|
|
* Used for reporting the results of HeapTupleHeaderAdvanceLatestRemovedXid()
|
|
|
|
* see vacuumlazy.c for full explanation
|
|
|
|
*/
|
|
|
|
typedef struct xl_heap_cleanup_info
|
|
|
|
{
|
|
|
|
RelFileNode node;
|
|
|
|
TransactionId latestRemovedXid;
|
|
|
|
} xl_heap_cleanup_info;
|
|
|
|
|
|
|
|
/* Full sizeof() of the struct (unlike the offsetof-based SizeOf macros) */
#define SizeOfHeapCleanupInfo (sizeof(xl_heap_cleanup_info))
|
|
|
|
|
|
|
|
/* This is for replacing a page's contents in toto */
|
|
|
|
/* NB: this is used for indexes as well as heaps */
|
|
|
|
typedef struct xl_heap_newpage
|
|
|
|
{
|
|
|
|
RelFileNode node;
|
|
|
|
ForkNumber forknum;
|
|
|
|
BlockNumber blkno; /* location of new page */
|
2013-12-03 23:10:47 +01:00
|
|
|
uint16 hole_offset; /* number of bytes before "hole" */
|
|
|
|
uint16 hole_length; /* number of bytes in "hole" */
|
|
|
|
/* entire page contents (minus the hole) follow at end of record */
|
2012-08-29 01:02:00 +02:00
|
|
|
} xl_heap_newpage;
|
|
|
|
|
2013-12-03 23:10:47 +01:00
|
|
|
/*
 * Bytes of xl_heap_newpage up to and including 'hole_length'; computed from
 * the last member so that any trailing struct padding is not counted.
 */
#define SizeOfHeapNewpage (offsetof(xl_heap_newpage, hole_length) + sizeof(uint16))
|
2012-08-29 01:02:00 +02:00
|
|
|
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This causes additional WAL-logging, also (there was previously a single
WAL record for a locked tuple; now there are as many as there are updated
copies of the tuple.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
|
|
|
/* bit values used in infobits_set */
#define XLHL_XMAX_IS_MULTI		0x01
#define XLHL_XMAX_LOCK_ONLY		0x02
#define XLHL_XMAX_EXCL_LOCK		0x04
#define XLHL_XMAX_KEYSHR_LOCK	0x08
#define XLHL_KEYS_UPDATED		0x10
|
|
|
|
|
2012-08-29 01:02:00 +02:00
|
|
|
/* This is what we need to know about lock */
|
|
|
|
typedef struct xl_heap_lock
|
|
|
|
{
|
|
|
|
xl_heaptid target; /* locked tuple id */
|
|
|
|
TransactionId locking_xid; /* might be a MultiXactId not xid */
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This causes additional WAL-logging, also (there was previously a single
WAL record for a locked tuple; now there are as many as updated copies
of the tuple there exist.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
|
|
|
int8 infobits_set; /* infomask and infomask2 bits to set */
|
2012-08-29 01:02:00 +02:00
|
|
|
} xl_heap_lock;
|
|
|
|
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This causes additional WAL-logging, also (there was previously a single
WAL record for a locked tuple; now there are as many as updated copies
of the tuple there exist.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
|
|
|
#define SizeOfHeapLock (offsetof(xl_heap_lock, infobits_set) + sizeof(int8))
|
|
|
|
|
|
|
|
/* This is what we need to know about locking an updated version of a row */
|
|
|
|
typedef struct xl_heap_lock_updated
|
|
|
|
{
|
|
|
|
xl_heaptid target;
|
2013-05-29 22:58:43 +02:00
|
|
|
TransactionId xmax;
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This causes additional WAL-logging, also (there was previously a single
WAL record for a locked tuple; now there are as many as updated copies
of the tuple there exist.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
|
|
|
uint8 infobits_set;
|
|
|
|
} xl_heap_lock_updated;
|
|
|
|
|
|
|
|
/*
 * Bytes of xl_heap_lock_updated up to and including 'infobits_set'; computed
 * from the last member so that any trailing struct padding is not counted.
 */
#define SizeOfHeapLockUpdated (offsetof(xl_heap_lock_updated, infobits_set) + sizeof(uint8))
|
2012-08-29 01:02:00 +02:00
|
|
|
|
|
|
|
/* This is what we need to know about in-place update */
|
|
|
|
typedef struct xl_heap_inplace
|
|
|
|
{
|
|
|
|
xl_heaptid target; /* updated tuple id */
|
|
|
|
/* TUPLE DATA FOLLOWS AT END OF STRUCT */
|
|
|
|
} xl_heap_inplace;
|
|
|
|
|
|
|
|
/*
 * Offset of 'target' plus SizeOfHeapTid (defined outside this section),
 * rather than sizeof() of the struct.
 */
#define SizeOfHeapInplace (offsetof(xl_heap_inplace, target) + SizeOfHeapTid)
|
|
|
|
|
Rework tuple freezing protocol
Tuple freezing was broken in connection to MultiXactIds; commit
8e53ae025de9 tried to fix it, but didn't go far enough. As noted by
Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted
update might cause locks in the multi to go ignored by later
transactions. This is because the code depended on a multixact above
their cutoff point not having any lock-only member older than the cutoff
point for Xids, which is easily defeated in READ COMMITTED transactions.
The fix for this involves creating a new MultiXactId when necessary.
But this cannot be done during WAL replay, and moreover multixact
examination requires using CLOG access routines which are not supposed
to be used during WAL replay either; so tuple freezing cannot be done
with the old freeze WAL record. Therefore, separate the freezing
computation from its execution, and change the WAL record to carry all
necessary information. At WAL replay time, it's easy to re-execute
freezing because we don't need to re-compute the new infomask/Xmax
values but just take them from the WAL record.
While at it, restructure the coding to ensure all page changes occur in
a single critical section without much room for failures. The previous
coding wasn't using a critical section, without any explanation as to
why this was acceptable.
In replication scenarios using the 9.3 branch, standby servers must be
upgraded before their master, so that they are prepared to deal with the
new WAL record once the master is upgraded; failure to do so will cause
WAL replay to die with a PANIC message. Later upgrade of the standby
will allow the process to continue where it left off, so there's no
disruption of the data in the standby in any case. Standbys know how to
deal with the old WAL record, so it's okay to keep the master running
the old code for a while.
In master, the old freeze WAL record is gone, for cleanliness' sake;
there's no compatibility concern there.
Backpatch to 9.3, where the original bug was introduced and where the
previous fix was backpatched.
Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
|
|
|
/*
|
|
|
|
* This struct represents a 'freeze plan', which is what we need to know about
|
|
|
|
* a single tuple being frozen during vacuum.
|
|
|
|
*/
|
2013-12-22 21:49:09 +01:00
|
|
|
/* 0x01 was XLH_FREEZE_XMIN */
|
Rework tuple freezing protocol
Tuple freezing was broken in connection to MultiXactIds; commit
8e53ae025de9 tried to fix it, but didn't go far enough. As noted by
Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted
update might cause locks in the multi to go ignored by later
transactions. This is because the code depended on a multixact above
their cutoff point not having any lock-only member older than the cutoff
point for Xids, which is easily defeated in READ COMMITTED transactions.
The fix for this involves creating a new MultiXactId when necessary.
But this cannot be done during WAL replay, and moreover multixact
examination requires using CLOG access routines which are not supposed
to be used during WAL replay either; so tuple freezing cannot be done
with the old freeze WAL record. Therefore, separate the freezing
computation from its execution, and change the WAL record to carry all
necessary information. At WAL replay time, it's easy to re-execute
freezing because we don't need to re-compute the new infomask/Xmax
values but just take them from the WAL record.
While at it, restructure the coding to ensure all page changes occur in
a single critical section without much room for failures. The previous
coding wasn't using a critical section, without any explanation as to
why this was acceptable.
In replication scenarios using the 9.3 branch, standby servers must be
upgraded before their master, so that they are prepared to deal with the
new WAL record once the master is upgraded; failure to do so will cause
WAL replay to die with a PANIC message. Later upgrade of the standby
will allow the process to continue where it left off, so there's no
disruption of the data in the standby in any case. Standbys know how to
deal with the old WAL record, so it's okay to keep the master running
the old code for a while.
In master, the old freeze WAL record is gone, for cleanliness' sake;
there's no compatibility concern there.
Backpatch to 9.3, where the original bug was introduced and where the
previous fix was backpatched.
Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
|
|
|
#define XLH_FREEZE_XVAC 0x02
|
|
|
|
#define XLH_INVALID_XVAC 0x04
|
|
|
|
|
|
|
|
typedef struct xl_heap_freeze_tuple
|
|
|
|
{
|
|
|
|
TransactionId xmax;
|
|
|
|
OffsetNumber offset;
|
|
|
|
uint16 t_infomask2;
|
|
|
|
uint16 t_infomask;
|
|
|
|
uint8 frzflags;
|
|
|
|
} xl_heap_freeze_tuple;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is what we need to know about a block being frozen during vacuum
|
|
|
|
*/
|
|
|
|
typedef struct xl_heap_freeze_page
|
2012-08-29 01:02:00 +02:00
|
|
|
{
|
|
|
|
RelFileNode node;
|
|
|
|
BlockNumber block;
|
|
|
|
TransactionId cutoff_xid;
|
Rework tuple freezing protocol
Tuple freezing was broken in connection to MultiXactIds; commit
8e53ae025de9 tried to fix it, but didn't go far enough. As noted by
Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted
update might cause locks in the multi to go ignored by later
transactions. This is because the code depended on a multixact above
their cutoff point not having any lock-only member older than the cutoff
point for Xids, which is easily defeated in READ COMMITTED transactions.
The fix for this involves creating a new MultiXactId when necessary.
But this cannot be done during WAL replay, and moreover multixact
examination requires using CLOG access routines which are not supposed
to be used during WAL replay either; so tuple freezing cannot be done
with the old freeze WAL record. Therefore, separate the freezing
computation from its execution, and change the WAL record to carry all
necessary information. At WAL replay time, it's easy to re-execute
freezing because we don't need to re-compute the new infomask/Xmax
values but just take them from the WAL record.
While at it, restructure the coding to ensure all page changes occur in
a single critical section without much room for failures. The previous
coding wasn't using a critical section, without any explanation as to
why this was acceptable.
In replication scenarios using the 9.3 branch, standby servers must be
upgraded before their master, so that they are prepared to deal with the
new WAL record once the master is upgraded; failure to do so will cause
WAL replay to die with a PANIC message. Later upgrade of the standby
will allow the process to continue where it left off, so there's no
disruption of the data in the standby in any case. Standbys know how to
deal with the old WAL record, so it's okay to keep the master running
the old code for a while.
In master, the old freeze WAL record is gone, for cleanliness' sake;
there's no compatibility concern there.
Backpatch to 9.3, where the original bug was introduced and where the
previous fix was backpatched.
Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
|
|
|
uint16 ntuples;
|
|
|
|
xl_heap_freeze_tuple tuples[FLEXIBLE_ARRAY_MEMBER];
|
|
|
|
} xl_heap_freeze_page;
|
2012-08-29 01:02:00 +02:00
|
|
|
|
Rework tuple freezing protocol
Tuple freezing was broken in connection to MultiXactIds; commit
8e53ae025de9 tried to fix it, but didn't go far enough. As noted by
Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted
update might cause locks in the multi to go ignored by later
transactions. This is because the code depended on a multixact above
their cutoff point not having any lock-only member older than the cutoff
point for Xids, which is easily defeated in READ COMMITTED transactions.
The fix for this involves creating a new MultiXactId when necessary.
But this cannot be done during WAL replay, and moreover multixact
examination requires using CLOG access routines which are not supposed
to be used during WAL replay either; so tuple freezing cannot be done
with the old freeze WAL record. Therefore, separate the freezing
computation from its execution, and change the WAL record to carry all
necessary information. At WAL replay time, it's easy to re-execute
freezing because we don't need to re-compute the new infomask/Xmax
values but just take them from the WAL record.
While at it, restructure the coding to ensure all page changes occur in
a single critical section without much room for failures. The previous
coding wasn't using a critical section, without any explanation as to
why this was acceptable.
In replication scenarios using the 9.3 branch, standby servers must be
upgraded before their master, so that they are prepared to deal with the
new WAL record once the master is upgraded; failure to do so will cause
WAL replay to die with a PANIC message. Later upgrade of the standby
will allow the process to continue where it left off, so there's no
disruption of the data in the standby in any case. Standbys know how to
deal with the old WAL record, so it's okay to keep the master running
the old code for a while.
In master, the old freeze WAL record is gone, for cleanliness' sake;
there's no compatibility concern there.
Backpatch to 9.3, where the original bug was introduced and where the
previous fix was backpatched.
Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
|
|
|
/* Number of bytes of xl_heap_freeze_page that precede its 'tuples' member. */
#define SizeOfHeapFreezePage offsetof(xl_heap_freeze_page, tuples)
|
2012-08-29 01:02:00 +02:00
|
|
|
|
|
|
|
/*
 * This is what we need to know about setting a visibility map bit.
 *
 * NOTE(review): the per-field notes below are inferred from the member
 * types and names only; confirm against the code that builds and replays
 * this record.
 */
|
|
|
|
typedef struct xl_heap_visible
|
|
|
|
{
|
|
|
|
RelFileNode node; /* presumably the relation the record applies to */
|
|
|
|
BlockNumber block; /* presumably the block number within that relation */
|
|
|
|
TransactionId cutoff_xid; /* cutoff transaction id stored with the record */
|
|
|
|
} xl_heap_visible;
|
|
|
|
|
|
|
|
/*
 * Size of xl_heap_visible measured up to the end of its last member
 * (offset of cutoff_xid plus its size), so any trailing struct padding
 * is not counted.
 */
#define SizeOfHeapVisible (offsetof(xl_heap_visible, cutoff_xid) + sizeof(TransactionId))
|
|
|
|
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible
for a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
|
|
|
/*
 * Record body carrying command-id information (cmin, cmax and combocid)
 * for one tuple, together with the top-level transaction id and the
 * tuple's relfilenode/ctid location.
 */
typedef struct xl_heap_new_cid
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* store toplevel xid so we don't have to merge cids from different
|
|
|
|
* transactions
|
|
|
|
*/
|
|
|
|
TransactionId top_xid;
|
|
|
|
CommandId cmin;
|
|
|
|
CommandId cmax;
|
|
|
|
/*
|
|
|
|
* don't really need the combocid since we have the actual values
|
|
|
|
* right in this struct, but the padding makes it free and it's
|
|
|
|
* useful for debugging.
|
|
|
|
*/
|
|
|
|
CommandId combocid;
|
|
|
|
/*
|
|
|
|
* Store the relfilenode/ctid pair to facilitate lookups.
|
|
|
|
*/
|
|
|
|
xl_heaptid target;
|
|
|
|
} xl_heap_new_cid;
|
|
|
|
|
2014-04-01 15:23:16 +02:00
|
|
|
/*
 * Size of xl_heap_new_cid: the offset of its 'target' member plus
 * SizeOfHeapTid bytes for that member.
 */
#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target) + SizeOfHeapTid)
|
|
|
|
|
Introduce logical decoding.
This feature, building on previous commits, allows the write-ahead log
stream to be decoded into a series of logical changes; that is,
inserts, updates, and deletes and the transactions which contain them.
It is capable of handling decoding even across changes to the schema
of the affected tables.  The output format is controlled by a
so-called "output plugin"; an example is included. To make use of
this in a real replication system, the output plugin will need to be
modified to produce output in the format appropriate to that system,
and to perform filtering.
Currently, information can be extracted from the logical decoding
system only via SQL; future commits will add the ability to stream
changes via walsender.
Andres Freund, with review and other contributions from many other
people, including Álvaro Herrera, Abhijit Menon-Sen, Peter Geoghegan,
Kevin Grittner, Robert Haas, Heikki Linnakangas, Fujii Masao, Abhijit
Menon-Sen, Michael Paquier, Simon Riggs, Craig Ringer, and Steve
Singer.
2014-03-03 22:32:18 +01:00
|
|
|
/*
 * logical rewrite xlog record header
 *
 * Identifies the transaction, database and relation a batch of mappings
 * belongs to, and records the write position, the number of mappings and
 * the insert LSN at which the rewrite began.
 */
|
|
|
|
typedef struct xl_heap_rewrite_mapping
|
|
|
|
{
|
|
|
|
TransactionId mapped_xid; /* xid that might need to see the row */
|
|
|
|
Oid mapped_db; /* DbOid or InvalidOid for shared rels */
|
|
|
|
Oid mapped_rel; /* Oid of the mapped relation */
|
|
|
|
/*
 * NOTE(review): the width of off_t is implementation-defined, so the size
 * of this member (and the layout of what follows it) may differ between
 * platforms -- confirm that is acceptable for this record.
 */
off_t offset; /* How far have we written so far */
|
|
|
|
uint32 num_mappings; /* Number of in-memory mappings */
|
|
|
|
XLogRecPtr start_lsn; /* Insert LSN at begin of rewrite */
|
|
|
|
} xl_heap_rewrite_mapping;
|
|
|
|
|
2012-08-29 01:02:00 +02:00
|
|
|
/*
 * Takes a tuple header and a pointer to a TransactionId.
 * NOTE(review): going by the name and the non-const pointer,
 * *latestRemovedXid is presumably an in/out value that may be advanced --
 * confirm in the implementation.
 */
extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
|
|
|
|
TransactionId *latestRemovedXid);
|
|
|
|
|
|
|
|
/*
 * Redo and description entry points.  The *_redo functions take the
 * record's LSN and a pointer to the record; the *_desc functions take an
 * output StringInfo, the record's xl_info byte and the record data.
 * NOTE(review): the heap_ and heap2_ pairs presumably correspond to the
 * RM_HEAP_ID and RM_HEAP2_ID resource managers respectively -- confirm.
 */
extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr);
|
|
|
|
extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec);
|
|
|
|
extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr);
|
|
|
|
extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec);
|
Introduce logical decoding.
This feature, building on previous commits, allows the write-ahead log
stream to be decoded into a series of logical changes; that is,
inserts, updates, and deletes and the transactions which contain them.
It is capable of handling decoding even across changes to the schema
of the affected tables.  The output format is controlled by a
so-called "output plugin"; an example is included. To make use of
this in a real replication system, the output plugin will need to be
modified to produce output in the format appropriate to that system,
and to perform filtering.
Currently, information can be extracted from the logical decoding
system only via SQL; future commits will add the ability to stream
changes via walsender.
Andres Freund, with review and other contributions from many other
people, including Álvaro Herrera, Abhijit Menon-Sen, Peter Geoghegan,
Kevin Grittner, Robert Haas, Heikki Linnakangas, Fujii Masao, Abhijit
Menon-Sen, Michael Paquier, Simon Riggs, Craig Ringer, and Steve
Singer.
2014-03-03 22:32:18 +01:00
|
|
|
extern void heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r);
|
2012-08-29 01:02:00 +02:00
|
|
|
|
|
|
|
extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode,
|
|
|
|
TransactionId latestRemovedXid);
|
|
|
|
extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
|
|
|
|
OffsetNumber *redirected, int nredirected,
|
|
|
|
OffsetNumber *nowdead, int ndead,
|
|
|
|
OffsetNumber *nowunused, int nunused,
|
|
|
|
TransactionId latestRemovedXid);
|
|
|
|
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
|
Rework tuple freezing protocol
Tuple freezing was broken in connection to MultiXactIds; commit
8e53ae025de9 tried to fix it, but didn't go far enough. As noted by
Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted
update might cause locks in the multi to go ignored by later
transactions. This is because the code depended on a multixact above
their cutoff point not having any lock-only member older than the cutoff
point for Xids, which is easily defeated in READ COMMITTED transactions.
The fix for this involves creating a new MultiXactId when necessary.
But this cannot be done during WAL replay, and moreover multixact
examination requires using CLOG access routines which are not supposed
to be used during WAL replay either; so tuple freezing cannot be done
with the old freeze WAL record. Therefore, separate the freezing
computation from its execution, and change the WAL record to carry all
necessary information. At WAL replay time, it's easy to re-execute
freezing because we don't need to re-compute the new infomask/Xmax
values but just take them from the WAL record.
While at it, restructure the coding to ensure all page changes occur in
a single critical section without much room for failures. The previous
coding wasn't using a critical section, without any explanation as to
why this was acceptable.
In replication scenarios using the 9.3 branch, standby servers must be
upgraded before their master, so that they are prepared to deal with the
new WAL record once the master is upgraded; failure to do so will cause
WAL replay to die with a PANIC message. Later upgrade of the standby
will allow the process to continue where it left off, so there's no
disruption of the data in the standby in any case. Standbys know how to
deal with the old WAL record, so it's okay to keep the master running
the old code for a while.
In master, the old freeze WAL record is gone, for cleanliness' sake;
there's no compatibility concern there.
Backpatch to 9.3, where the original bug was introduced and where the
previous fix was backpatched.
Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
|
|
|
TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples,
|
|
|
|
int ntuples);
|
|
|
|
/*
 * Tuple freezing is exposed as two separate steps.
 *
 * heap_prepare_freeze_tuple takes a tuple header plus transaction-id and
 * multixact cutoffs and an xl_heap_freeze_tuple to fill in;
 * heap_execute_freeze_tuple takes a tuple header and an
 * xl_heap_freeze_tuple to apply to it.
 *
 * NOTE(review): the meaning of the bool result is not visible from this
 * header -- presumably it reports whether the tuple needs freezing;
 * confirm in the implementation.
 */
extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
|
|
|
|
TransactionId cutoff_xid,
|
|
|
|
TransactionId cutoff_multi,
|
|
|
|
xl_heap_freeze_tuple *frz);
|
|
|
|
extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
|
|
|
|
xl_heap_freeze_tuple *xlrec_tp);
|
2013-03-22 14:54:07 +01:00
|
|
|
extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
|
2012-08-29 01:02:00 +02:00
|
|
|
Buffer vm_buffer, TransactionId cutoff_xid);
|
|
|
|
extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
|
2013-12-03 23:10:47 +01:00
|
|
|
BlockNumber blk, Page page, bool page_std);
|
|
|
|
extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std);
|
2012-08-29 01:02:00 +02:00
|
|
|
|
|
|
|
#endif /* HEAPAM_XLOG_H */
|